diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d96c339dadc7..d874b21b0873d 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License -cmake_minimum_required(VERSION 3.15) +cmake_minimum_required(VERSION 3.10) cmake_policy(VERSION 3.10) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) @@ -22,9 +22,6 @@ include(system) project(paddle CXX C) -include(init) -include(generic) # simplify cmake module - # enable language CUDA # TODO(Shibo Tao): remove find_package(CUDA) completely. find_package(CUDA QUIET) @@ -33,16 +30,24 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) -# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON +option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) +# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON # to develop some acl related functionality on x86 option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) +# Note(zhouwei): It use option above, so put here +include(init) +include(generic) # simplify cmake module + if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() if (WITH_GPU AND WITH_ASCEND) message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time") endif() +if (WITH_GPU AND WITH_ROCM) + message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time") +endif() if(WITH_GPU AND NOT APPLE) enable_language(CUDA) @@ -61,7 +66,7 @@ if(WITH_MUSL) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy") endif() -if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) +if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() @@ -99,9 +104,11 @@ if(WIN32) endif() endforeach(flag_var) endif() - - # NOTE(Avin0323): Less parallel count result in faster compilation. - math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") + + # NOTE(zhouwei25): temporarily change MP to 1 for reducing CPU & memory utilization + set(PROCESS_MAX 1) + #math(EXPR PROCESS_MAX "${CPU_CORES} * 1 / 2") + # windows build turn off warnings, use parallel compiling. 
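Earlier in this hunk, WITH_ASCEND_CL builds without WITH_ASCEND_CXX11 append -D_GLIBCXX_USE_CXX11_ABI=0, which pins libstdc++ to its pre-C++11 string/list ABI so Paddle can link against CANN libraries built that way. A minimal probe (not part of the patch) showing which ABI a translation unit was compiled with:

```cpp
// abi_probe.cc -- hypothetical helper, not part of this patch.
// Build it once with and once without -D_GLIBCXX_USE_CXX11_ABI=0 to see the
// effect of the flag added for WITH_ASCEND_CL builds.
#include <iostream>
#include <string>

int main() {
#if defined(_GLIBCXX_USE_CXX11_ABI) && _GLIBCXX_USE_CXX11_ABI == 0
  std::cout << "pre-C++11 libstdc++ ABI (what the ASCEND CL build forces)\n";
#else
  std::cout << "C++11 libstdc++ ABI\n";
#endif
  // sizeof(std::string) differs between the two libstdc++ ABIs.
  std::cout << "sizeof(std::string) = " << sizeof(std::string) << "\n";
  return 0;
}
```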
foreach(flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE @@ -129,6 +136,9 @@ if(WIN32) foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) set(${flag_var} "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221") + if(MSVC_STATIC_CRT) + set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB") + endif() endforeach(flag_var) if (WITH_WIN_DUMP_DBG) @@ -168,8 +178,6 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(ON_INFER "Turn on inference optimization and inference-lib generation" OFF) ################################ Internal Configurations ####################################### -option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) -option(WITH_RCCL "Compile PaddlePaddle with RCCL support" OFF) option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) @@ -180,14 +188,15 @@ option(WITH_PSLIB "Compile with pslib support" OFF) option(WITH_BOX_PS "Compile with box_ps support" OFF) option(WITH_XBYAK "Compile with xbyak support" ON) option(WITH_CONTRIB "Compile the third-party contributation" OFF) -option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE}) +option(WITH_HETERPS "Compile with heterps" OFF) option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}) option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF) option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) +option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) @@ -195,6 +204,7 @@ option(WITH_SW "Compile PaddlePaddle with sw support" OFF) option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF) option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF) +option(WITH_STRIP "Strip so files of Whl packages" OFF) # PY_VERSION if(NOT PY_VERSION) @@ -255,9 +265,6 @@ endif() if(WITH_BRPC_RDMA) message(STATUS "Use brpc with rdma.") - if(WITH_GRPC) - message(FATAL_ERROR "Can't use grpc with brpc rdma.") - endif() if(NOT WITH_DISTRIBUTE) message(FATAL_ERROR "Can't use brpc rdma in no distribute env.") endif() @@ -305,9 +312,9 @@ endif(WITH_ROCM) if (NOT WITH_ROCM AND WITH_RCCL) MESSAGE(WARNING - "Disable RCCL when compiling without GPU. Force WITH_RCCL=OFF.") - set(WITH_NCCL OFF CACHE STRING - "Disable RCCL when compiling without GPU" FORCE) + "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.") + set(WITH_RCCL OFF CACHE STRING + "Disable RCCL when compiling without ROCM" FORCE) endif() if(WITH_RCCL) @@ -362,6 +369,13 @@ else() message(WARNING "On inference mode, will take place some specific optimization.
Turn on the ON_INFER flag when building inference_lib only.") endif() +if(WITH_STRIP) + find_program(STRIP_PATH strip) + if(NOT STRIP_PATH OR NOT LINUX) + set(WITH_STRIP OFF CACHE STRING "Command strip is only used on Linux when it exists." FORCE) + endif() +endif() + add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 9f1eb16fcf03f..e7f125269be1f 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -173,10 +173,9 @@ if(WITH_PSCORE) add_definitions(-DPADDLE_WITH_PSCORE) endif() - -if(WITH_GRPC) - add_definitions(-DPADDLE_WITH_GRPC) -endif(WITH_GRPC) +if(WITH_HETERPS) + add_definitions(-DPADDLE_WITH_HETERPS) +endif() if(WITH_BRPC_RDMA) add_definitions(-DPADDLE_WITH_BRPC_RDMA) diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index bddd2023b437b..414b2a54be034 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -21,7 +21,13 @@ else() set(ASCEND_DIR /usr/local/Ascend) endif() -if(WITH_ASCEND) +if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h) + # It means CANN 20.2 + + add_definitions(-DPADDLE_WITH_ASCEND_STRING) +endif() + + +if(WITH_ASCEND OR WITH_ASCEND_CL) set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share) @@ -43,9 +49,6 @@ if(WITH_ASCEND) set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so) INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR}) - if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h) - add_definitions(-DPADDLE_WITH_ASCEND_STRING) - endif() ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib}) @@ -62,17 +65,23 @@ endif() if(WITH_ASCEND_CL) set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) + set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so) set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so) set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so) - set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) + set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) + set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include) - message(STATUS "ASCEND_CL_INC_DIR ${ASCEND_CL_INC_DIR}") + message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}") message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}") - INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR}) + INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR}) + INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR}) ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib}) + ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib}) + ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib}) add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler) diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake index 2e4a67093dc54..e8db13a694f55 100644 --- a/cmake/external/gloo.cmake +++ b/cmake/external/gloo.cmake @@ -32,7 +32,7 @@ cache_third_party(extern_gloo TAG ${GLOO_TAG} DIR GLOO_SOURCE_DIR) -if(WITH_ASCEND) + if(WITH_ASCEND OR WITH_ASCEND_CL) ExternalProject_Add( extern_gloo ${EXTERNAL_PROJECT_LOG_ARGS} diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake deleted file mode 100644 index 
536e95c1dc2a4..0000000000000 --- a/cmake/external/grpc.cmake +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -include (ExternalProject) - -SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc) -SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) -SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE) -SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) - -include(ProcessorCount) -ProcessorCount(NUM_OF_PROCESSOR) - -IF(APPLE) - SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin | sed "s/-Werror//g" | sh) - SET(GRPC_INSTALL_CMD make prefix=${GRPC_INSTALL_DIR} install) -ELSE() - SET(GRPC_CFLAGS "-Wno-error -std=c11 ${CLFAGS}") - SET(GRPC_CXXFLAGS "-Wno-error -std=c++11 ${CXXFLAGS}") - SET(BUILD_CMD make CFLAGS=${GRPC_CFLAGS} CXXFLAGS=${GRPC_CXXFLAGS} HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin) - SET(GRPC_INSTALL_CMD make prefix=${GRPC_INSTALL_DIR} install CFLAGS=${GRPC_CFLAGS} CXXFLAGS=${GRPC_CXXFLAGS}) -ENDIF() - -# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them -ExternalProject_Add( - extern_grpc - DEPENDS protobuf zlib - # NOTE(wuyi): - # this package is generated by following steps: - # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git - # 2. git submodule update --init - # 3. keep only zlib, cares, protobuf, boringssl under "third_party", - # checkout and clean other dirs under third_party - # 4. remove .git, and package the directory. - URL http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x_paddle.tar.gz - URL_MD5 f5442d137ddccee252e194b1bc90f98c - PREFIX ${GRPC_SOURCES_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_IN_SOURCE 1 - # NOTE(yuyang18): - # Disable -Werror, otherwise the compile will fail in MacOS. - # It seems that we cannot configure that by make command. 
- # Just dry run make command and remove `-Werror`, then use a shell to run make commands - BUILD_COMMAND ${BUILD_CMD} - INSTALL_COMMAND ${GRPC_INSTALL_CMD} -) - -ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a") - -ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc++.a") -ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgpr.a") - -ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a") - -include_directories(${GRPC_INCLUDE_DIR}) -ADD_DEPENDENCIES(grpc++_unsecure extern_grpc) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 884219d8dd81f..fb1d4d9d56dcc 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git) -SET(MKLDNN_TAG 72efa005effb49595933e033cc732f215ef0445a) +SET(MKLDNN_TAG f58682cd8bd0615f41d879f8afc8f1511ab42d24) # Introduce variables: # * CMAKE_INSTALL_LIBDIR diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 82d64fd022883..c108c05368c91 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -242,7 +242,7 @@ endif() ) ENDFUNCTION() -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) SET(PROTOBUF_VERSION 3.8.0) else() SET(PROTOBUF_VERSION 3.1.0) diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake index 0eabdb4e127bd..f9cb3a9075a82 100644 --- a/cmake/external/threadpool.cmake +++ b/cmake/external/threadpool.cmake @@ -16,7 +16,7 @@ INCLUDE(ExternalProject) SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool) SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool) -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git) else() SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index a4367510ac703..100b915339469 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -43,7 +43,7 @@ cache_third_party(extern_warpctc TAG ${WARPCTC_TAG} DIR WARPCTC_SOURCE_DIR) -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c85654a5674a0..a5c74a46631e9 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -447,9 +447,20 @@ function(cc_test TARGET_NAME) cc_test_build(${TARGET_NAME} SRCS ${cc_test_SRCS} DEPS ${cc_test_DEPS}) - cc_test_run(${TARGET_NAME} - COMMAND ${TARGET_NAME} - ARGS ${cc_test_ARGS}) + # we dont test hcom op, because it need complex configuration + # with more than one machine + if(NOT ("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test" OR + "${TARGET_NAME}" STREQUAL 
"c_allgather_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "send_v2_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_reduce_sum_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test")) + cc_test_run(${TARGET_NAME} + COMMAND ${TARGET_NAME} + ARGS ${cc_test_ARGS}) + endif() endif() endfunction(cc_test) @@ -807,7 +818,7 @@ function(py_test TARGET_NAME) ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() - + if (WIN32) set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150) endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 4864e04fa0516..9694a7bc59c12 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -211,11 +211,11 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") if(WIN32) set(paddle_inference_c_lib $/paddle_inference_c.*) else(WIN32) - set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_inference_c.*) + set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi_exp/libpaddle_inference_c.*) endif(WIN32) copy(inference_lib_dist - SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_inference_c_lib} + SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib} DSTS ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib) # fluid library for both train and inference diff --git a/cmake/init.cmake b/cmake/init.cmake index 19fdb6c601a11..b11156d2e9986 100644 --- a/cmake/init.cmake +++ b/cmake/init.cmake @@ -18,10 +18,10 @@ if(NOT WIN32) set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") else() - # It has not been used now, it can specify CUDA compile flag manualy, + # It can specify CUDA compile flag manualy, # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous # because CUDA will update by nvidia, then error will occur. - # Now, it's used in CUDA:[10.0, 10.2] + # Now, it's only used in VS2015 + CUDA:[10.0, 10.2] set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) endif() diff --git a/cmake/paddle_win.props b/cmake/paddle_win.props index 296940dc3f50c..3c069bd2981c4 100644 --- a/cmake/paddle_win.props +++ b/cmake/paddle_win.props @@ -88,4 +88,3 @@ set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)" - diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 81fa7d0dfa98f..f90fa3509d63d 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -29,9 +29,9 @@ set(third_party_deps) # 2. REPOSITORY: specify git REPOSITORY of 3rd party # 3. TAG: specify git tag/branch/commitID of 3rd party # 4. DIR: overwrite the original SOURCE_DIR when cache directory -# +# # The function Return 1 PARENT_SCOPE variables: -# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add, +# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add, # and you no longer need to set any donwnload steps in ExternalProject_Add. 
# For example: # Cache_third_party(${TARGET} @@ -52,7 +52,7 @@ FUNCTION(cache_third_party TARGET) SET(${TARGET_NAME}_DOWNLOAD_CMD GIT_REPOSITORY ${cache_third_party_REPOSITORY}) IF(cache_third_party_TAG) - LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD + LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD GIT_TAG ${cache_third_party_TAG}) ENDIF() ELSEIF(cache_third_party_URL) @@ -130,7 +130,7 @@ ENDFUNCTION() # Correction of flags on different Platform(WIN/MAC) and Print Warning Message if (APPLE) if(WITH_MKL) - MESSAGE(WARNING + MESSAGE(WARNING "Mac is not supported with MKL in Paddle yet. Force WITH_MKL=OFF.") set(WITH_MKL OFF CACHE STRING "Disable MKL for building on mac" FORCE) endif() @@ -141,7 +141,7 @@ if(WIN32 OR APPLE) SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE) if(WITH_LIBXSMM) - MESSAGE(WARNING + MESSAGE(WARNING "Windows, Mac are not supported with libxsmm in Paddle yet." "Force WITH_LIBXSMM=OFF") SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM in Windows and MacOS" FORCE) @@ -276,7 +276,7 @@ endif(WITH_BOX_PS) if(WITH_ASCEND OR WITH_ASCEND_CL) include(external/ascend) - if(WITH_ASCEND) + if(WITH_ASCEND OR WITH_ASCEND_CL) list(APPEND third_party_deps extern_ascend) endif() if(WITH_ASCEND_CL) @@ -290,7 +290,7 @@ if (WITH_PSCORE) include(external/leveldb) list(APPEND third_party_deps extern_leveldb) - + include(external/brpc) list(APPEND third_party_deps extern_brpc) diff --git a/go/demo/mobilenet_c_exp.cc b/go/demo/mobilenet_c_exp.cc new file mode 100644 index 0000000000000..b4f42dab6790b --- /dev/null +++ b/go/demo/mobilenet_c_exp.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
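The new demo that follows exercises the capi_exp interface end to end and releases every handle with an explicit *Destroy call. C++ callers can tie the same handles to std::unique_ptr with custom deleters so they are released on every exit path; a small sketch, assuming pd_inference_api.h is the umbrella header among the pd_*.h files that inference_lib.cmake now installs:

```cpp
// Sketch only: RAII wrappers over the capi_exp handles used by the demo.
// "pd_inference_api.h" is an assumption about the umbrella header name.
#include <memory>
#include "pd_inference_api.h"

using PredictorPtr =
    std::unique_ptr<PD_Predictor, decltype(&PD_PredictorDestroy)>;
using TensorPtr = std::unique_ptr<PD_Tensor, decltype(&PD_TensorDestroy)>;

// PD_PredictorCreate takes ownership of the config, as the demo notes.
inline PredictorPtr MakePredictor(PD_Config* config) {
  return PredictorPtr(PD_PredictorCreate(config), &PD_PredictorDestroy);
}
```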
+#include +#include +#include + +void ReadData(float* data, int size); + +int main(int argc, char* argv[]) { + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, "data/model/__model__", "data/model/__params__"); + PD_ConfigDisableGlogInfo(config); + + PD_Predictor* predictor = PD_PredictorCreate(config); + // config has destroyed in PD_PredictorCreate + config = NULL; + + int input_num = PD_PredictorGetInputNum(predictor); + printf("Input num: %d\n", input_num); + int output_num = PD_PredictorGetOutputNum(predictor); + printf("Output num: %d\n", output_num); + + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* input_tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + PD_OneDimArrayCstrDestroy(input_names); + input_names = NULL; + + int32_t shape[] = {1, 3, 300, 300}; + float* data = (float*)malloc(sizeof(float) * 1 * 3 * 300 * 300); // NOLINT + ReadData(data, 1 * 3 * 300 * 300); // NOLINT + PD_TensorReshape(input_tensor, 4, shape); + PD_TensorCopyFromCpuFloat(input_tensor, data); + free(data); + data = NULL; + PD_PredictorRun(predictor); + + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_OneDimArrayCstrDestroy(output_names); + output_names = nullptr; + + PD_OneDimArrayInt32* out_shape = PD_TensorGetShape(output_tensor); + int32_t size = 1; + for (size_t index = 0; index < out_shape->size; ++index) { + size = size * out_shape->data[index]; + } + PD_OneDimArrayInt32Destroy(out_shape); + out_shape = NULL; + + data = (float*)malloc(sizeof(float) * size); // NOLINT + PD_TensorCopyToCpuFloat(output_tensor, data); + free(data); + data = NULL; + + PD_TensorDestroy(output_tensor); + output_tensor = NULL; + PD_TensorDestroy(input_tensor); + input_tensor = NULL; + PD_PredictorDestroy(predictor); + predictor = NULL; + + return 0; +} + +void ReadData(float* data, int n) { + FILE* fp = fopen("data/data.txt", "r"); + for (int i = 0; i < n; i++) { + fscanf(fp, "%f", &data[i]); + } + fclose(fp); +} diff --git a/paddle/extension.h b/paddle/extension.h index 71469576853a3..98d4bfd0326c5 100644 --- a/paddle/extension.h +++ b/paddle/extension.h @@ -15,4 +15,4 @@ limitations under the License. 
*/ #pragma once // All paddle apis in C++ frontend -#include "paddle/fluid/extension/include/ext_all.h" +#include "paddle/extension/include/ext_all.h" diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 5a2d7a06201ba..a2062d82c8130 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -14,6 +14,7 @@ endif() add_subdirectory(table) add_subdirectory(service) add_subdirectory(test) +add_subdirectory(index_dataset) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) diff --git a/paddle/fluid/distributed/fleet.cc b/paddle/fluid/distributed/fleet.cc index 9aafdd769ed4a..dfd55f16e1a06 100644 --- a/paddle/fluid/distributed/fleet.cc +++ b/paddle/fluid/distributed/fleet.cc @@ -146,6 +146,44 @@ void FleetWrapper::CreateClient2ClientConnection() { client2client_max_retry_); } +std::future FleetWrapper::PullSparseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, std::vector* fea_keys, + std::vector>* fea_values, int fea_value_dim) { + fea_keys->clear(); + fea_keys->resize(0); + fea_keys->reserve(MAX_FEASIGN_NUM); + for (auto name : var_names) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + continue; + } + LoDTensor* tensor = var->GetMutable(); + CHECK(tensor != nullptr) << "tensor of var " << name << " is null"; + int64_t* ids = tensor->data(); + size_t len = tensor->numel(); + for (auto i = 0u; i < len; ++i) { + if (ids[i] == 0u) { + continue; + } + fea_keys->push_back(static_cast(ids[i])); + } + } + fea_values->resize(fea_keys->size() + 1); + for (auto& t : *fea_values) { + t.resize(fea_value_dim); + } + std::vector pull_result_ptr; + for (auto& t : *fea_values) { + pull_result_ptr.push_back(t.data()); + } + + bool training = true; + return pserver_ptr_->_worker_ptr->pull_sparse(pull_result_ptr.data(), + table_id, fea_keys->data(), + fea_keys->size(), training); +} + void FleetWrapper::PullSparseVarsSync( const Scope& scope, const uint64_t table_id, const std::vector& var_names, std::vector* fea_keys, diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/fleet.h index 863440180a808..0da5d1e2bf987 100644 --- a/paddle/fluid/distributed/fleet.h +++ b/paddle/fluid/distributed/fleet.h @@ -84,6 +84,15 @@ class FleetWrapper { int fea_dim, const std::vector& var_emb_names); + // Pull sparse variables from server in async mode + // Param: scope, table_id, var_names, fea_keys, fea_dim + // Param: fea_values std::future + std::future PullSparseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, + std::vector* fea_keys, + std::vector>* fea_values, int fea_dim); + // Pull sparse variables from server in sync mode // pull immediately to tensors // is_training is true means training, false means inference, the behavior is diff --git a/paddle/fluid/distributed/index_dataset/CMakeLists.txt b/paddle/fluid/distributed/index_dataset/CMakeLists.txt new file mode 100644 index 0000000000000..a30488494a52b --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/CMakeLists.txt @@ -0,0 +1,7 @@ +proto_library(index_dataset_proto SRCS index_dataset.proto) +cc_library(index_wrapper SRCS index_wrapper.cc DEPS index_dataset_proto fs) +cc_library(index_sampler SRCS index_sampler.cc DEPS index_wrapper) + +if(WITH_PYTHON) + py_proto_compile(index_dataset_py_proto SRCS index_dataset.proto) +endif() diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc 
b/paddle/fluid/distributed/index_dataset/index_dataset.proto similarity index 55% rename from paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc rename to paddle/fluid/distributed/index_dataset/index_dataset.proto index 3f3b6b959e301..1b4ee313671ad 100644 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc +++ b/paddle/fluid/distributed/index_dataset/index_dataset.proto @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,16 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" +syntax = "proto2"; +package paddle.distributed; -namespace paddle { -namespace operators { -namespace distributed { +message IndexNode { + required uint64 id = 1; + required bool is_leaf = 2; + required float probability = 3; +} -std::once_flag AsyncSparseParamUpdateRecorder::init_flag_; -std::unique_ptr - AsyncSparseParamUpdateRecorder::recorder_(nullptr); +message TreeMeta { + required int32 height = 1; + required int32 branch = 2; +} -} // namespace distributed -} // namespace operators -} // namespace paddle +message KVItem { + required bytes key = 1; + required bytes value = 2; +} \ No newline at end of file diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.cc b/paddle/fluid/distributed/index_dataset/index_sampler.cc new file mode 100644 index 0000000000000..58f85d98fb09c --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_sampler.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
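The index_dataset.proto messages above describe the on-disk records of a tree index: each record is a KVItem whose value holds either the TreeMeta (under the key .tree_meta) or a serialized IndexNode keyed by its code, and, judging from TreeIndex::Load later in this patch, records are framed by a native int length prefix. A hedged writer sketch for such a file; the file name, node values, and the generated header path are assumptions:

```cpp
// tree_writer.cc -- illustrative only; the framing (int length + KVItem
// bytes) is inferred from TreeIndex::Load in this patch.
#include <cstdio>
#include <string>
#include "paddle/fluid/distributed/index_dataset/index_dataset.pb.h"

void AppendItem(FILE* fp, const std::string& key, const std::string& value) {
  paddle::distributed::KVItem item;
  item.set_key(key);
  item.set_value(value);
  std::string buf;
  item.SerializeToString(&buf);
  int num = static_cast<int>(buf.size());
  fwrite(&num, sizeof(num), 1, fp);        // length prefix read by Load()
  fwrite(buf.data(), 1, buf.size(), fp);   // serialized KVItem payload
}

int main() {
  FILE* fp = fopen("demo.tree", "wb");
  paddle::distributed::TreeMeta meta;
  meta.set_height(3);
  meta.set_branch(2);
  AppendItem(fp, ".tree_meta", meta.SerializeAsString());

  paddle::distributed::IndexNode node;
  node.set_id(42);           // item id, must be non-zero per Load()
  node.set_is_leaf(true);
  node.set_probability(1.0f);
  AppendItem(fp, "6", node.SerializeAsString());  // key is the node code
  fclose(fp);
  return 0;
}
```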
+ +#include "paddle/fluid/distributed/index_dataset/index_sampler.h" +#include "paddle/fluid/operators/math/sampler.h" + +namespace paddle { +namespace distributed { + +using Sampler = paddle::operators::math::Sampler; + +std::vector> LayerWiseSampler::sample( + const std::vector>& user_inputs, + const std::vector& target_ids, bool with_hierarchy) { + auto input_num = target_ids.size(); + auto user_feature_num = user_inputs[0].size(); + std::vector> outputs( + input_num * layer_counts_sum_, + std::vector(user_feature_num + 2)); + + auto max_layer = tree_->Height(); + std::vector sampler_vec(max_layer - start_sample_layer_); + std::vector> layer_ids(max_layer - + start_sample_layer_); + + auto layer_index = max_layer - 1; + size_t idx = 0; + while (layer_index >= start_sample_layer_) { + auto layer_codes = tree_->GetLayerCodes(layer_index); + layer_ids[idx] = tree_->GetNodes(layer_codes); + sampler_vec[idx] = new paddle::operators::math::UniformSampler( + layer_ids[idx].size() - 1, seed_); + layer_index--; + idx++; + } + + idx = 0; + for (size_t i = 0; i < input_num; i++) { + auto travel_codes = + tree_->GetTravelCodes(target_ids[i], start_sample_layer_); + auto travel_path = tree_->GetNodes(travel_codes); + for (size_t j = 0; j < travel_path.size(); j++) { + // user + if (j > 0 && with_hierarchy) { + auto ancestor_codes = + tree_->GetAncestorCodes(user_inputs[i], max_layer - j - 1); + auto hierarchical_user = tree_->GetNodes(ancestor_codes); + for (int idx_offset = 0; idx_offset <= layer_counts_[j]; idx_offset++) { + for (size_t k = 0; k < user_feature_num; k++) { + outputs[idx + idx_offset][k] = hierarchical_user[k].id(); + } + } + } else { + for (int idx_offset = 0; idx_offset <= layer_counts_[j]; idx_offset++) { + for (size_t k = 0; k < user_feature_num; k++) { + outputs[idx + idx_offset][k] = user_inputs[i][k]; + } + } + } + + // sampler ++ + outputs[idx][user_feature_num] = travel_path[j].id(); + outputs[idx][user_feature_num + 1] = 1.0; + idx += 1; + for (int idx_offset = 0; idx_offset < layer_counts_[j]; idx_offset++) { + int sample_res = 0; + do { + sample_res = sampler_vec[j]->Sample(); + } while (layer_ids[j][sample_res].id() == travel_path[j].id()); + outputs[idx + idx_offset][user_feature_num] = + layer_ids[j][sample_res].id(); + outputs[idx + idx_offset][user_feature_num + 1] = 0; + } + idx += layer_counts_[j]; + } + } + for (size_t i = 0; i < sampler_vec.size(); i++) { + delete sampler_vec[i]; + } + return outputs; +} + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.h b/paddle/fluid/distributed/index_dataset/index_sampler.h new file mode 100644 index 0000000000000..66882bedc9b76 --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_sampler.h @@ -0,0 +1,100 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
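LayerWiseSampler::sample above emits, for each target, one positive row per sampled layer followed by layer_counts_[j] negatives drawn uniformly from the same layer, each row laid out as [user feature ids..., node id, label]. A usage sketch under the assumption that a tree file like the one written above has been prepared (tree name and feature values are placeholders):

```cpp
// sampler_demo.cc -- illustrative usage of the classes added in this patch.
#include <cstdint>
#include <iostream>
#include <vector>
#include "paddle/fluid/distributed/index_dataset/index_sampler.h"
#include "paddle/fluid/distributed/index_dataset/index_wrapper.h"

int main() {
  namespace dist = paddle::distributed;
  // Register the tree once; get_tree_index() enforces this ordering.
  dist::IndexWrapper::GetInstance()->insert_tree_index("demo", "demo.tree");

  auto sampler = dist::IndexSampler::Init<dist::LayerWiseSampler>("demo");
  // One negative per layer, starting the walk from layer 1.
  sampler->init_layerwise_conf({1, 1, 1}, /*start_sample_layer=*/1, /*seed=*/7);

  std::vector<std::vector<uint64_t>> user_inputs = {{1001, 1002}};
  std::vector<uint64_t> target_ids = {42};
  auto rows = sampler->sample(user_inputs, target_ids, /*with_hierarchy=*/false);

  for (const auto& row : rows) {
    // node id sits before the last column, the label is the last column
    std::cout << "node=" << row[row.size() - 2]
              << " label=" << row[row.size() - 1] << "\n";
  }
  return 0;
}
```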
+ +#pragma once +#include +#include "paddle/fluid/distributed/index_dataset/index_wrapper.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +class IndexSampler { + public: + virtual ~IndexSampler() {} + IndexSampler() {} + + template + static std::shared_ptr Init(const std::string& name) { + std::shared_ptr instance = nullptr; + instance.reset(new T(name)); + return instance; + } + + virtual void init_layerwise_conf(const std::vector& layer_sample_counts, + int start_sample_layer = 1, int seed = 0) {} + virtual void init_beamsearch_conf(const int64_t k) {} + virtual std::vector> sample( + const std::vector>& user_inputs, + const std::vector& input_targets, + bool with_hierarchy = false) = 0; +}; + +class LayerWiseSampler : public IndexSampler { + public: + virtual ~LayerWiseSampler() {} + explicit LayerWiseSampler(const std::string& name) { + tree_ = IndexWrapper::GetInstance()->get_tree_index(name); + } + + void init_layerwise_conf(const std::vector& layer_sample_counts, + int start_sample_layer, int seed) override { + seed_ = seed; + start_sample_layer_ = start_sample_layer; + + PADDLE_ENFORCE_GT( + start_sample_layer_, 0, + paddle::platform::errors::InvalidArgument( + "start sampler layer = [%d], it should greater than 0.", + start_sample_layer_)); + PADDLE_ENFORCE_LT(start_sample_layer_, tree_->Height(), + paddle::platform::errors::InvalidArgument( + "start sampler layer = [%d], it should less than " + "max_layer, which is [%d].", + start_sample_layer_, tree_->Height())); + + size_t i = 0; + layer_counts_sum_ = 0; + layer_counts_.clear(); + int cur_layer = start_sample_layer_; + while (cur_layer < tree_->Height()) { + int layer_sample_num = 1; + if (i < layer_sample_counts.size()) { + layer_sample_num = layer_sample_counts[i]; + } + layer_counts_sum_ += layer_sample_num + 1; + layer_counts_.push_back(layer_sample_num); + VLOG(3) << "[INFO] level " << cur_layer + << " sample_layer_counts.push_back: " << layer_sample_num; + cur_layer += 1; + i += 1; + } + reverse(layer_counts_.begin(), layer_counts_.end()); + VLOG(3) << "sample counts sum: " << layer_counts_sum_; + } + std::vector> sample( + const std::vector>& user_inputs, + const std::vector& target_ids, bool with_hierarchy) override; + + private: + std::vector layer_counts_; + int64_t layer_counts_sum_{0}; + std::shared_ptr tree_{nullptr}; + int seed_{0}; + int start_sample_layer_{1}; +}; + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.cc b/paddle/fluid/distributed/index_dataset/index_wrapper.cc new file mode 100644 index 0000000000000..99fe4ca0c6d04 --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_wrapper.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/io/fs.h" + +#include +#include +#include "paddle/fluid/distributed/index_dataset/index_wrapper.h" + +namespace paddle { +namespace distributed { + +std::shared_ptr IndexWrapper::s_instance_(nullptr); + +int TreeIndex::Load(const std::string filename) { + int err_no; + auto fp = paddle::framework::fs_open_read(filename, &err_no, ""); + PADDLE_ENFORCE_NE( + fp, nullptr, + platform::errors::InvalidArgument( + "Open file %s failed. Please check whether the file exists.", + filename)); + + int num = 0; + max_id_ = 0; + fake_node_.set_id(0); + fake_node_.set_is_leaf(false); + fake_node_.set_probability(0.0); + max_code_ = 0; + size_t ret = fread(&num, sizeof(num), 1, fp.get()); + while (ret == 1 && num > 0) { + std::string content(num, '\0'); + size_t read_num = + fread(const_cast(content.data()), 1, num, fp.get()); + PADDLE_ENFORCE_EQ( + read_num, static_cast(num), + platform::errors::InvalidArgument( + "Read from file: %s failed. Valid Format is " + "an integer representing the length of the following string, " + "and the string itself.We got an iteger[% d], " + "but the following string's length is [%d].", + filename, num, read_num)); + + KVItem item; + PADDLE_ENFORCE_EQ( + item.ParseFromString(content), true, + platform::errors::InvalidArgument("Parse from file: %s failed. It's " + "content can't be parsed by KVItem.", + filename)); + + if (item.key() == ".tree_meta") { + meta_.ParseFromString(item.value()); + } else { + auto code = boost::lexical_cast(item.key()); + IndexNode node; + node.ParseFromString(item.value()); + PADDLE_ENFORCE_NE(node.id(), 0, + platform::errors::InvalidArgument( + "Node'id should not be equel to zero.")); + if (node.is_leaf()) { + id_codes_map_[node.id()] = code; + } + data_[code] = node; + if (node.id() > max_id_) { + max_id_ = node.id(); + } + if (code > max_code_) { + max_code_ = code; + } + } + ret = fread(&num, sizeof(num), 1, fp.get()); + } + total_nodes_num_ = data_.size(); + max_code_ += 1; + return 0; +} + +std::vector TreeIndex::GetNodes(const std::vector& codes) { + std::vector nodes; + nodes.reserve(codes.size()); + for (size_t i = 0; i < codes.size(); i++) { + if (CheckIsValid(codes[i])) { + nodes.push_back(data_.at(codes[i])); + } else { + nodes.push_back(fake_node_); + } + } + return nodes; +} + +std::vector TreeIndex::GetLayerCodes(int level) { + uint64_t level_num = static_cast(std::pow(meta_.branch(), level)); + uint64_t level_offset = level_num - 1; + + std::vector res; + res.reserve(level_num); + for (uint64_t i = 0; i < level_num; i++) { + auto code = level_offset + i; + if (CheckIsValid(code)) { + res.push_back(code); + } + } + return res; +} + +std::vector TreeIndex::GetAncestorCodes( + const std::vector& ids, int level) { + std::vector res; + res.reserve(ids.size()); + + int cur_level; + for (size_t i = 0; i < ids.size(); i++) { + if (id_codes_map_.find(ids[i]) == id_codes_map_.end()) { + res.push_back(max_code_); + } else { + auto code = id_codes_map_.at(ids[i]); + cur_level = meta_.height() - 1; + + while (level >= 0 && cur_level > level) { + code = (code - 1) / meta_.branch(); + cur_level--; + } + res.push_back(code); + } + } + return res; +} + +std::vector TreeIndex::GetChildrenCodes(uint64_t ancestor, + int level) { + auto level_code_num = static_cast(std::pow(meta_.branch(), level)); + auto code_min = level_code_num - 1; + auto code_max = meta_.branch() * level_code_num - 1; + + std::vector parent; + parent.push_back(ancestor); + 
std::vector res; + size_t p_idx = 0; + while (true) { + size_t p_size = parent.size(); + for (; p_idx < p_size; p_idx++) { + for (int i = 0; i < meta_.branch(); i++) { + auto code = parent[p_idx] * meta_.branch() + i + 1; + if (data_.find(code) != data_.end()) parent.push_back(code); + } + } + if ((code_min <= parent[p_idx]) && (parent[p_idx] < code_max)) { + break; + } + } + + return std::vector(parent.begin() + p_idx, parent.end()); +} + +std::vector TreeIndex::GetTravelCodes(uint64_t id, int start_level) { + std::vector res; + PADDLE_ENFORCE_NE(id_codes_map_.find(id), id_codes_map_.end(), + paddle::platform::errors::InvalidArgument( + "id = %d doesn't exist in Tree.", id)); + auto code = id_codes_map_.at(id); + int level = meta_.height() - 1; + + while (level >= start_level) { + res.push_back(code); + code = (code - 1) / meta_.branch(); + level--; + } + return res; +} + +std::vector TreeIndex::GetAllLeafs() { + std::vector res; + res.reserve(id_codes_map_.size()); + for (auto& ite : id_codes_map_) { + auto code = ite.second; + res.push_back(data_.at(code)); + } + return res; +} + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.h b/paddle/fluid/distributed/index_dataset/index_wrapper.h new file mode 100644 index 0000000000000..8fb8faf6c84a2 --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_wrapper.h @@ -0,0 +1,120 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/distributed/index_dataset/index_dataset.pb.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +class Index { + public: + Index() {} + ~Index() {} +}; + +class TreeIndex : public Index { + public: + TreeIndex() {} + ~TreeIndex() {} + + int Height() { return meta_.height(); } + int Branch() { return meta_.branch(); } + uint64_t TotalNodeNums() { return total_nodes_num_; } + uint64_t EmbSize() { return max_id_ + 1; } + int Load(const std::string path); + + inline bool CheckIsValid(int code) { + if (data_.find(code) != data_.end()) { + return true; + } else { + return false; + } + } + + std::vector GetNodes(const std::vector& codes); + std::vector GetLayerCodes(int level); + std::vector GetAncestorCodes(const std::vector& ids, + int level); + std::vector GetChildrenCodes(uint64_t ancestor, int level); + std::vector GetTravelCodes(uint64_t id, int start_level); + std::vector GetAllLeafs(); + + std::unordered_map data_; + std::unordered_map id_codes_map_; + uint64_t total_nodes_num_; + TreeMeta meta_; + uint64_t max_id_; + uint64_t max_code_; + IndexNode fake_node_; +}; + +using TreePtr = std::shared_ptr; + +class IndexWrapper { + public: + virtual ~IndexWrapper() {} + IndexWrapper() {} + + void clear_tree() { tree_map.clear(); } + + TreePtr get_tree_index(const std::string name) { + PADDLE_ENFORCE_NE(tree_map.find(name), tree_map.end(), + paddle::platform::errors::InvalidArgument( + "tree [%s] doesn't exist. Please insert it firstly " + "by API[\' insert_tree_index \'].", + name)); + return tree_map[name]; + } + + void insert_tree_index(const std::string name, const std::string tree_path) { + if (tree_map.find(name) != tree_map.end()) { + VLOG(0) << "Tree " << name << " has already existed."; + return; + } + TreePtr tree = std::make_shared(); + int ret = tree->Load(tree_path); + PADDLE_ENFORCE_EQ(ret, 0, paddle::platform::errors::InvalidArgument( + "Load tree[%s] from path[%s] failed. 
Please " + "check whether the file exists.", + name, tree_path)); + tree_map.insert(std::pair{name, tree}); + } + + static std::shared_ptr GetInstancePtr() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::distributed::IndexWrapper()); + } + return s_instance_; + } + + static IndexWrapper* GetInstance() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::distributed::IndexWrapper()); + } + return s_instance_.get(); + } + + private: + static std::shared_ptr s_instance_; + std::unordered_map tree_map; +}; + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/service/CMakeLists.txt b/paddle/fluid/distributed/service/CMakeLists.txt index 843dea9eea6ef..d1f04e26ade72 100644 --- a/paddle/fluid/distributed/service/CMakeLists.txt +++ b/paddle/fluid/distributed/service/CMakeLists.txt @@ -16,6 +16,7 @@ set_source_files_properties(communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUT set_source_files_properties(service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_ps_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(ps_local_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_utils.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(heter_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -29,7 +30,8 @@ set_source_files_properties(graph_brpc_client.cc PROPERTIES COMPILE_FLAGS ${DIST cc_library(brpc_utils SRCS brpc_utils.cc DEPS tensor device_context ${COMMON_DEPS} ${RPC_DEPS}) cc_library(downpour_server SRCS graph_brpc_server.cc brpc_ps_server.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) -cc_library(downpour_client SRCS graph_brpc_client.cc brpc_ps_client.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) +cc_library(downpour_client SRCS graph_brpc_client.cc brpc_ps_client.cc +ps_local_client.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) cc_library(client SRCS ps_client.cc DEPS downpour_client boost ${RPC_DEPS}) cc_library(server SRCS server.cc DEPS downpour_server boost ${RPC_DEPS}) diff --git a/paddle/fluid/distributed/service/brpc_ps_client.cc b/paddle/fluid/distributed/service/brpc_ps_client.cc index b49a71ab0c13a..a6ad9d08f52fd 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/service/brpc_ps_client.cc @@ -880,8 +880,8 @@ std::future BrpcPsClient::send_client2client_msg( auto promise = std::make_shared>(); std::future fut = promise->get_future(); if (to_client_id >= _client_channels.size()) { - LOG(FATAL) << "to_client_id is out of range clients, which size is " - << _client_channels.size(); + VLOG(0) << "to_client_id is out of range clients, which size is " + << _client_channels.size(); promise->set_value(-1); return fut; } @@ -1001,4 +1001,4 @@ int32_t BrpcPsClient::recv_and_save_table(const uint64_t table_id, } } // namespace distributed -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/fluid/distributed/service/communicator.h b/paddle/fluid/distributed/service/communicator.h index 043fe9d83dfc5..fa60cab2b5877 100644 --- a/paddle/fluid/distributed/service/communicator.h +++ b/paddle/fluid/distributed/service/communicator.h @@ -310,6 +310,8 @@ class Communicator { return _worker_ptr; } + RecvCtxMap 
&GetRecvCtxMap() { return recv_varname_to_ctx_; } + std::shared_ptr _worker_ptr; // pointer to worker protected: diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc index a6271cac83c9a..eafb4d596cc16 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -135,7 +135,8 @@ std::future GraphBrpcClient::get_node_feat( closure->request(request_idx) ->add_params(joint_feature_name.c_str(), joint_feature_name.size()); - PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), closure->response(request_idx), closure); diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/service/graph_py_service.h index e185f23e3d240..c6657be96ba44 100644 --- a/paddle/fluid/distributed/service/graph_py_service.h +++ b/paddle/fluid/distributed/service/graph_py_service.h @@ -54,19 +54,7 @@ class GraphPyService { std::vector table_feat_conf_feat_dtype; std::vector table_feat_conf_feat_shape; - // std::thread *server_thread, *client_thread; - - // std::shared_ptr pserver_ptr; - - // std::shared_ptr worker_ptr; - public: - // std::shared_ptr get_ps_server() { - // return pserver_ptr; - // } - // std::shared_ptr get_ps_client() { - // return worker_ptr; - // } int get_shard_num() { return shard_num; } void set_shard_num(int shard_num) { this->shard_num = shard_num; } void GetDownpourSparseTableProto( diff --git a/paddle/fluid/distributed/service/ps_client.cc b/paddle/fluid/distributed/service/ps_client.cc index 3f78908baa3b1..d45f41a0f58de 100644 --- a/paddle/fluid/distributed/service/ps_client.cc +++ b/paddle/fluid/distributed/service/ps_client.cc @@ -16,12 +16,15 @@ #include "glog/logging.h" #include "paddle/fluid/distributed/service/brpc_ps_client.h" #include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/ps_local_client.h" #include "paddle/fluid/distributed/table/table.h" namespace paddle { namespace distributed { REGISTER_PSCORE_CLASS(PSClient, BrpcPsClient); +REGISTER_PSCORE_CLASS(PSClient, PsLocalClient); REGISTER_PSCORE_CLASS(PSClient, GraphBrpcClient); + int32_t PSClient::configure( const PSParameter &config, const std::map> ®ions, @@ -83,4 +86,4 @@ PSClient *PSClientFactory::create(const PSParameter &ps_config) { return client; } } // namespace distributed -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/fluid/distributed/service/ps_client.h b/paddle/fluid/distributed/service/ps_client.h index 1c8abc6c2e8dc..74a1e0dde71fc 100644 --- a/paddle/fluid/distributed/service/ps_client.h +++ b/paddle/fluid/distributed/service/ps_client.h @@ -118,6 +118,17 @@ class PSClient { const uint64_t *keys, size_t num, bool is_training) = 0; + virtual ::std::future pull_sparse_ptr(char **select_values, + size_t table_id, + const uint64_t *keys, + size_t num) { + VLOG(0) << "Did not implement"; + std::promise promise; + std::future fut = promise.get_future(); + promise.set_value(-1); + return fut; + } + virtual std::future print_table_stat(uint32_t table_id) = 0; // 确保所有积攒中的请求都发起发送 @@ -150,7 +161,7 @@ class PSClient { virtual std::future send_client2client_msg(int msg_type, int to_client_id, const std::string &msg) { - LOG(FATAL) << 
"Did not implement"; + VLOG(0) << "Did not implement"; std::promise promise; std::future fut = promise.get_future(); promise.set_value(-1); diff --git a/paddle/fluid/distributed/service/ps_local_client.cc b/paddle/fluid/distributed/service/ps_local_client.cc new file mode 100644 index 0000000000000..2acc845a50890 --- /dev/null +++ b/paddle/fluid/distributed/service/ps_local_client.cc @@ -0,0 +1,269 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/service/ps_local_client.h" +#include "paddle/fluid/distributed/table/table.h" + +//#define pslib_debug_dense_compress + +namespace paddle { +namespace distributed { +int32_t PsLocalClient::initialize() { + const auto& downpour_param = _config.server_param().downpour_server_param(); + TableManager::instance().initialize(); + for (size_t i = 0; i < downpour_param.downpour_table_param_size(); ++i) { + auto* table = CREATE_PSCORE_CLASS( + Table, downpour_param.downpour_table_param(i).table_class()); + table->initialize(downpour_param.downpour_table_param(i), + _config.fs_client_param()); + table->set_shard(0, 1); + _table_map[downpour_param.downpour_table_param(i).table_id()].reset(table); + } + return 0; +} + +::std::future PsLocalClient::shrink(uint32_t table_id, + const std::string threshold) { + // TODO + return done(); +} + +::std::future PsLocalClient::load(const std::string& epoch, + const std::string& mode) { + // TODO + // for (auto& it : _table_map) { + // load(it.first, epoch, mode); + //} + return done(); +} +::std::future PsLocalClient::load(uint32_t table_id, + const std::string& epoch, + const std::string& mode) { + // TODO + // auto* table_ptr = table(table_id); + // table_ptr->load(epoch, mode); + return done(); +} + +::std::future PsLocalClient::save(const std::string& epoch, + const std::string& mode) { + // TODO + for (auto& it : _table_map) { + save(it.first, epoch, mode); + } + return done(); +} +::std::future PsLocalClient::save(uint32_t table_id, + const std::string& epoch, + const std::string& mode) { + // TODO + auto* table_ptr = table(table_id); + table_ptr->flush(); + table_ptr->save(epoch, mode); + return done(); +} + +::std::future PsLocalClient::clear() { + // TODO + return done(); +} +::std::future PsLocalClient::clear(uint32_t table_id) { + // TODO + return done(); +} + +::std::future PsLocalClient::flush() { + // no need + return done(); +} + +::std::future PsLocalClient::stop_server() { + // no need + return done(); +} + +::std::future PsLocalClient::pull_dense(Region* regions, + size_t region_num, + size_t table_id) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + uint32_t num_per_shard = dense_dim_per_shard(accessor->fea_dim(), 1); + std::vector region_buffer; + region_buffer.resize(num_per_shard); + table_ptr->pull_dense(region_buffer.data(), region_buffer.size()); + + size_t region_idx = 0; + size_t region_data_idx = 0; + size_t shard_data_size = num_per_shard; + 
size_t shard_buffer_remain = shard_data_size * sizeof(float); + PADDLE_ENFORCE_EQ( + shard_buffer_remain, region_buffer.size() * sizeof(float), + platform::errors::PreconditionNotMet("pull dense size error.")); + size_t index = 0; + while (shard_buffer_remain > 0 && region_idx < region_num) { + auto& region = regions[region_idx]; + if (region.size - region_data_idx >= shard_buffer_remain) { + memcpy((void*)(region.data + region_data_idx), + (uint8_t*)(void*)(region_buffer.data()) + index, + shard_buffer_remain); + region_data_idx += shard_buffer_remain; + shard_buffer_remain = 0; + } else if (region.size - region_data_idx == 0) { + ++region_idx; + region_data_idx = 0; + } else { + memcpy((void*)(region.data + region_data_idx), + (uint8_t*)(void*)(region_buffer.data()) + index, + region.size - region_data_idx); + shard_buffer_remain -= (region.size - region_data_idx); + index += (region.size - region_data_idx); + ++region_idx; + region_data_idx = 0; + } + } + + return done(); +} + +::std::future PsLocalClient::push_dense_param(const Region* regions, + size_t region_num, + size_t table_id) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + std::vector region_buffer; + region_buffer.resize(dense_dim_per_shard(accessor->fea_dim(), 1), 0); + for (size_t i = 0, offset = 0; i < region_num; ++i) { + uint32_t data_num = regions[i].size / sizeof(float); + memcpy(region_buffer.data() + offset, regions[i].data, regions[i].size); + offset += data_num; + } + + // table_ptr->push_dense_param(region_buffer.data(), region_buffer.size()); + + return done(); +} + +::std::future PsLocalClient::push_dense_raw_gradient( + int table_id, float* total_send_data, size_t total_send_data_size, + void* callback) { + VLOG(1) << "wxx push_dense_raw_gradient"; + + PSClientClosure* closure = reinterpret_cast(callback); + + auto* table_ptr = table(table_id); + + table_ptr->push_dense(total_send_data, total_send_data_size); + delete closure; + return done(); +} + +::std::future PsLocalClient::push_dense(const Region* regions, + size_t region_num, + size_t table_id) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + std::vector region_buffer; + region_buffer.resize(dense_dim_per_shard(accessor->fea_dim(), 1)); + size_t data_size = region_buffer.size(); + for (size_t i = 0, offset = 0; i < region_num; ++i) { + uint32_t data_num = regions[i].size / sizeof(float); + PADDLE_ENFORCE_LE( + offset + data_num, data_size, + platform::errors::PreconditionNotMet( + "invalid dense size, cur pos[%d] data_num[%d] size[%d]", offset, + data_num, data_size)); + memcpy(region_buffer.data() + offset, regions[i].data, regions[i].size); + offset += data_num; + } + + table_ptr->push_dense(region_buffer.data(), region_buffer.size()); + + return done(); +} + +//::std::future PsLocalClient::pull_sparse(float** select_values, +// size_t table_id, +// const uint64_t* keys, +// size_t num) { +// // FIXME +// // auto timer = +// // std::make_shared("pslib_downpour_client_pull_sparse"); +// // auto local_timer = +// // std::make_shared("pslib_downpour_client_pull_sparse_local"); +// //将key拆分到各shard请求,并记录原始对应value指针 +// auto* accessor = table_accessor(table_id); +// auto* table_ptr = table(table_id); +// size_t value_size = accessor->select_size(); +// +// // table_ptr->pull_sparse(keys, num); +// std::vector res_data; +// res_data.resize(num * value_size / sizeof(float)); +// table_ptr->pull_sparse(res_data.data(), keys, num); +// // memcpy(select_values[0], res_data->data(), 
res_data->size() * +// // sizeof(float)); +// size_t offset = 0; +// for (int i = 0; i < num; ++i) { +// memcpy(select_values[i], (char*)res_data.data() + offset, value_size); +// offset += value_size; +// } +// +// // return fut; +// return done(); +//} + +::std::future PsLocalClient::pull_sparse_ptr(char** select_values, + size_t table_id, + const uint64_t* keys, + size_t num) { + // FIXME + // auto timer = + // std::make_shared("pslib_downpour_client_pull_sparse"); + // auto local_timer = + // std::make_shared("pslib_downpour_client_pull_sparse_local"); + //将key拆分到各shard请求,并记录原始对应value指针 + auto* table_ptr = table(table_id); + + table_ptr->pull_sparse_ptr(select_values, keys, num); + + return done(); +} + +::std::future PsLocalClient::push_sparse_raw_gradient( + size_t table_id, const uint64_t* keys, const float** update_values, + size_t num, void* callback) { + VLOG(1) << "wxx push_sparse_raw_gradient"; + PSClientClosure* closure = reinterpret_cast(callback); + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + table_ptr->push_sparse(keys, update_values, num); + delete closure; + return done(); +} + +::std::future PsLocalClient::push_sparse(size_t table_id, + const uint64_t* keys, + const float** update_values, + size_t num) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + table_ptr->push_sparse(keys, update_values, num); + return done(); +} +} +} diff --git a/paddle/fluid/distributed/service/ps_local_client.h b/paddle/fluid/distributed/service/ps_local_client.h new file mode 100644 index 0000000000000..9d2b01a45fe92 --- /dev/null +++ b/paddle/fluid/distributed/service/ps_local_client.h @@ -0,0 +1,226 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License 0// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
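The pull_dense/push_dense paths above shuttle data between the caller's list of Region views and one contiguous float buffer per shard. A hedged sketch of the gather half of that copy; RegionView and the buffer layout are illustrative stand-ins, not the Paddle types.

```cpp
// Illustrative gather: copy a sequence of raw byte regions into one
// contiguous float buffer, tracking the float offset as push_dense does.
// RegionView is a stand-in for paddle::distributed::Region.
#include <cstdint>
#include <cstring>
#include <vector>

struct RegionView {
  const char* data;
  size_t size;  // bytes, expected to be a multiple of sizeof(float)
};

std::vector<float> gather(const RegionView* regions, size_t region_num) {
  std::vector<float> buffer;
  for (size_t i = 0; i < region_num; ++i) {
    size_t data_num = regions[i].size / sizeof(float);
    size_t offset = buffer.size();
    buffer.resize(offset + data_num);
    std::memcpy(buffer.data() + offset, regions[i].data, regions[i].size);
  }
  return buffer;
}
```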
+ +#pragma once +#include "brpc/channel.h" +#include "brpc/controller.h" +#include "brpc/server.h" +#include "paddle/fluid/distributed/service/ps_client.h" + +namespace paddle { +namespace distributed { + +class Table; + +class PsLocalClient : public PSClient { + public: + PsLocalClient() {} + virtual ~PsLocalClient() { _running = false; } + virtual int32_t create_client2client_connection(int pslib_timeout_ms, + int pslib_connect_timeout_ms, + int max_retry) { + return 0; + } + + virtual ::std::future shrink(uint32_t table_id, + const std::string threshold) override; + virtual ::std::future load(const std::string& epoch, + const std::string& mode) override; + virtual ::std::future load(uint32_t table_id, + const std::string& epoch, + const std::string& mode) override; + + virtual ::std::future save(const std::string& epoch, + const std::string& mode) override; + virtual ::std::future save(uint32_t table_id, + const std::string& epoch, + const std::string& mode) override; + + virtual ::std::future clear() override; + virtual ::std::future clear(uint32_t table_id) override; + + virtual ::std::future stop_server() override; + + virtual void finalize_worker() override {} + virtual ::std::future pull_dense(Region* regions, size_t region_num, + size_t table_id); + + virtual ::std::future push_dense(const Region* regions, + size_t region_num, size_t table_id); + + virtual ::std::future push_dense_param(const Region* regions, + size_t region_num, + size_t table_id); + + virtual ::std::future pull_sparse(float** select_values, + size_t table_id, + const uint64_t* keys, size_t num, + bool is_training) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual ::std::future pull_sparse_ptr(char** select_values, + size_t table_id, + const uint64_t* keys, + size_t num); + + virtual ::std::future print_table_stat(uint32_t table_id) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + virtual ::std::future push_sparse(size_t table_id, + const uint64_t* keys, + const float** update_values, + size_t num); + + virtual ::std::future flush(); + // server profilera + virtual std::future start_profiler() { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + }; + + virtual std::future stop_profiler() { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future barrier(size_t table_id, uint32_t barrier_type) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future pull_geo_param(size_t table_id, + std::vector* values, + std::vector* keys, + int pserver_idx) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future push_global_step(int table_id, + int64_t* total_send_data, + void* done) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + // recv table from server and save it in LodTensor + virtual int32_t recv_and_save_table(const uint64_t table_id, + const std::string& path) { + return 0; + } + + virtual ::std::future send_client2client_msg( + int msg_type, int to_client_id, const std::string& msg) override { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + virtual size_t get_server_nums() { return 1; } + + virtual std::future push_dense_raw_gradient( + 
int table_id, float* total_send_data, size_t total_send_data_size, + void* callback) override; + + virtual std::future push_sparse_raw_gradient( + size_t table_id, const uint64_t* keys, const float** update_values, + size_t num, void* callback) override; + + virtual std::future push_sparse_raw_gradient_partial( + size_t table_id, const uint64_t* keys, const float** update_values, + uint32_t num, void* done, int pserver_idx) override { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future push_sparse_param(size_t table_id, + const uint64_t* keys, + const float** update_values, + size_t num, + void* done) override { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + private: + virtual int32_t initialize() override; + + std::future done() { + std::shared_ptr> prom = + std::make_shared>(); + std::future fut = prom->get_future(); + prom->set_value(0); + return fut; + } + + inline uint32_t dense_dim_per_shard(uint32_t dense_dim_total, + uint32_t shard_num) { + return dense_dim_total / shard_num + 1; + } + + inline std::unordered_map>* table() { + return &_table_map; + } + + inline Table* table(size_t table_id) { + auto itr = _table_map.find(table_id); + if (itr != _table_map.end()) { + return itr->second.get(); + } + LOG(ERROR) << "table not found " << table_id; + return NULL; + } + + std::unordered_map> _table_map; + + bool _running = false; + bool _flushing = false; + + private: + float _mae = 0; + float _mse = 0; + uint16_t _push_times = 0; +}; +} +} diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/distributed/service/ps_local_server.h similarity index 56% rename from paddle/fluid/operators/distributed/parameter_send.h rename to paddle/fluid/distributed/service/ps_local_server.h index 4335ef8c73cc0..dfbccc70900e3 100644 --- a/paddle/fluid/operators/distributed/parameter_send.h +++ b/paddle/fluid/distributed/service/ps_local_server.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
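pull_sparse_ptr, added above in both the implementation and the client interface, hands back pointers into the table's own value storage instead of copying each row out. A simplified sketch of that zero-copy contract; the std::unordered_map and fixed-width Row are stand-ins for the real sparse-table storage.

```cpp
// Zero-copy sketch of pull_sparse_ptr: out[i] aliases storage owned by
// the table, so no per-row memcpy happens on the pull path. Row's width
// is an illustrative assumption.
#include <cstdint>
#include <unordered_map>

struct Row {
  float data[8] = {0.f};  // assumed embedding width, illustration only
};

void pull_rows_by_ptr(std::unordered_map<uint64_t, Row>& table,
                      const uint64_t* keys, size_t num, char** out) {
  for (size_t i = 0; i < num; ++i) {
    Row& row = table[keys[i]];                   // creates the row on a miss
    out[i] = reinterpret_cast<char*>(row.data);  // caller reads in place
  }
}
```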
@@ -14,22 +14,24 @@ #pragma once -#include +#include #include - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" +#include "paddle/fluid/distributed/service/server.h" namespace paddle { -namespace operators { namespace distributed { -template -struct ParameterSend { - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope, - bool sync, int multi_parts); -}; +class PsLocalServer : public PSServer { + public: + PsLocalServer() {} + virtual ~PsLocalServer() {} + virtual uint64_t start() { return 0; } + virtual uint64_t start(const std::string& ip, uint32_t port) { return 0; } + virtual int32_t stop() { return 0; } + virtual int32_t port() { return 0; } -}; // namespace distributed -}; // namespace operators -}; // namespace paddle + private: + virtual int32_t initialize() { return 0; } +}; +} +} diff --git a/paddle/fluid/distributed/service/server.cc b/paddle/fluid/distributed/service/server.cc index 9324adad6979e..e44876e3d2b78 100644 --- a/paddle/fluid/distributed/service/server.cc +++ b/paddle/fluid/distributed/service/server.cc @@ -17,12 +17,14 @@ #include "glog/logging.h" #include "paddle/fluid/distributed/service/brpc_ps_server.h" #include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/ps_local_server.h" #include "paddle/fluid/distributed/table/table.h" namespace paddle { namespace distributed { REGISTER_PSCORE_CLASS(PSServer, BrpcPsServer); +REGISTER_PSCORE_CLASS(PSServer, PsLocalServer); REGISTER_PSCORE_CLASS(PsBaseService, BrpcPsService); REGISTER_PSCORE_CLASS(PSServer, GraphBrpcServer); REGISTER_PSCORE_CLASS(PsBaseService, GraphBrpcService); diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc index 020bcdcc52ef4..0dc99de1bfe82 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -171,7 +171,7 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { auto paths = paddle::string::split_string(path, ";"); - int count = 0; + int64_t count = 0; std::string sample_type = "random"; bool is_weighted = false; int valid_count = 0; diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h index 8ddf3c8f904a6..b18da82abe61c 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -33,26 +33,11 @@ namespace paddle { namespace distributed { class GraphShard { public: - // static int bucket_low_bound; - // static int gcd(int s, int t) { - // if (s % t == 0) return t; - // return gcd(t, s % t); - // } size_t get_size(); GraphShard() {} - GraphShard(int shard_num) { - this->shard_num = shard_num; - // bucket_size = init_bucket_size(shard_num); - // bucket.resize(bucket_size); - } + GraphShard(int shard_num) { this->shard_num = shard_num; } std::vector &get_bucket() { return bucket; } std::vector get_batch(int start, int end, int step); - // int init_bucket_size(int shard_num) { - // for (int i = bucket_low_bound;; i++) { - // if (gcd(i, shard_num) == 1) return i; - // } - // return -1; - // } std::vector get_ids_by_range(int start, int end) { std::vector res; for (int i = start; i < end && i < bucket.size(); i++) { @@ -64,7 +49,6 @@ class GraphShard { FeatureNode *add_feature_node(uint64_t id); 
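server.cc above registers PsLocalServer next to BrpcPsServer, so the server implementation can be chosen by name at runtime. The expansion of REGISTER_PSCORE_CLASS / CREATE_PSCORE_CLASS is not shown in this diff, so the following is only an assumed sketch of the usual name-to-factory mechanism behind such macros.

```cpp
// Hypothetical factory sketch of the registry pattern implied by
// REGISTER_PSCORE_CLASS(PSServer, PsLocalServer): a creator is stored
// under the class name so a config string can later produce an instance.
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>

struct PSServerBase { virtual ~PSServerBase() = default; };

using Creator = std::function<std::unique_ptr<PSServerBase>()>;

std::unordered_map<std::string, Creator>& registry() {
  static std::unordered_map<std::string, Creator> r;
  return r;
}

template <class T>
bool register_server(const std::string& name) {
  registry()[name] = [] { return std::unique_ptr<PSServerBase>(new T()); };
  return true;
}

std::unique_ptr<PSServerBase> create_server(const std::string& name) {
  auto it = registry().find(name);
  return it == registry().end() ? nullptr : it->second();
}
```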
Node *find_node(uint64_t id); void add_neighboor(uint64_t id, uint64_t dst_id, float weight); - // std::unordered_map::iterator> std::unordered_map get_node_location() { return node_location; } @@ -131,7 +115,7 @@ class GraphTable : public SparseTable { protected: std::vector shards; size_t shard_start, shard_end, server_num, shard_num_per_table, shard_num; - const int task_pool_size_ = 11; + const int task_pool_size_ = 24; const int random_sample_nodes_ranges = 3; std::vector feat_name; diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index a25a90aa9a7c1..1c315d34abcb6 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -126,17 +126,17 @@ void ProcessALine(const std::vector& columns, const Meta& meta, int64_t SaveToText(std::ostream* os, std::shared_ptr block, const int mode) { int64_t not_save_num = 0; - for (auto value : block->values_) { - if (mode == SaveMode::delta && !value.second->need_save_) { + for (auto& value : block->values_) { + if (mode == SaveMode::delta && !value.second.need_save_) { not_save_num++; continue; } - auto* vs = value.second->data_.data(); + auto* vs = value.second.data_; std::stringstream ss; auto id = value.first; - ss << id << "\t" << value.second->count_ << "\t" - << value.second->unseen_days_ << "\t" << value.second->is_entry_ << "\t"; + ss << id << "\t" << value.second.count_ << "\t" << value.second.unseen_days_ + << "\t" << value.second.is_entry_ << "\t"; for (int i = 0; i < block->value_length_; i++) { ss << vs[i]; @@ -148,7 +148,7 @@ int64_t SaveToText(std::ostream* os, std::shared_ptr block, os->write(ss.str().c_str(), sizeof(char) * ss.str().size()); if (mode == SaveMode::base || mode == SaveMode::delta) { - value.second->need_save_ = false; + value.second.need_save_ = false; } } @@ -446,6 +446,43 @@ int32_t CommonSparseTable::pull_sparse(float* pull_values, return 0; } +int32_t CommonSparseTable::pull_sparse_ptr(char** pull_values, + const uint64_t* keys, size_t num) { + std::vector> offset_bucket; + offset_bucket.resize(task_pool_size_); + + for (int x = 0; x < num; ++x) { + auto y = keys[x] % task_pool_size_; + offset_bucket[y].push_back(x); + } + + std::vector> tasks(task_pool_size_); + + for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) { + tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( + [this, shard_id, &keys, &offset_bucket, &pull_values]() -> int { + auto& block = shard_values_[shard_id]; + auto& offsets = offset_bucket[shard_id]; + + for (int i = 0; i < offsets.size(); ++i) { + auto offset = offsets[i]; + auto id = keys[offset]; + auto* value = block->InitGet(id); + // std::copy_n(value + param_offset_, param_dim_, + // pull_values + param_dim_ * offset); + pull_values[offset] = (char*)value; + } + + return 0; + }); + } + + for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { + tasks[shard_id].wait(); + } + return 0; +} + int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, const float* values, size_t num) { rwlock_->RDLock(); @@ -502,6 +539,45 @@ int32_t CommonSparseTable::push_sparse(const uint64_t* keys, return 0; } +int32_t CommonSparseTable::push_sparse(const uint64_t* keys, + const float** values, size_t num) { + _push_sparse(keys, values, num); + return 0; +} + +int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, + const float** values, size_t num) { + rwlock_->RDLock(); + std::vector> offset_bucket; + 
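pull_sparse_ptr above and the _push_sparse overload that continues below share one pattern: bucket the key offsets by key % task_pool_size_, run one task per shard, and wait on every future. A self-contained sketch with std::async standing in for the per-shard _shards_task_pool; the simplification drops the guarantee that all work for a shard runs on that shard's single thread.

```cpp
// Shard-bucketing sketch: group key offsets by shard, process each
// bucket concurrently, then join. std::async replaces Paddle's
// per-shard ThreadPool, so same-shard ordering is NOT preserved here.
#include <cstdint>
#include <functional>
#include <future>
#include <vector>

void for_each_shard(const uint64_t* keys, size_t num, int shard_count,
                    const std::function<void(int, const std::vector<size_t>&)>& fn) {
  std::vector<std::vector<size_t>> buckets(shard_count);
  for (size_t i = 0; i < num; ++i) {
    buckets[keys[i] % shard_count].push_back(i);  // store offsets, not keys
  }
  std::vector<std::future<void>> tasks;
  for (int s = 0; s < shard_count; ++s) {
    tasks.emplace_back(std::async(std::launch::async,
                                  [&fn, &buckets, s] { fn(s, buckets[s]); }));
  }
  for (auto& t : tasks) t.wait();
}
```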
offset_bucket.resize(task_pool_size_); + + for (int x = 0; x < num; ++x) { + auto y = keys[x] % task_pool_size_; + offset_bucket[y].push_back(x); + } + + std::vector> tasks(task_pool_size_); + + for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) { + tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( + [this, shard_id, &keys, &values, num, &offset_bucket]() -> int { + auto& offsets = offset_bucket[shard_id]; + for (size_t i = 0; i < offsets.size(); ++i) { + std::vector tmp_off = {0}; + optimizer_->update(keys + offsets[i], values[offsets[i]], num, + tmp_off, shard_values_[shard_id].get()); + } + return 0; + }); + } + + for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { + tasks[shard_id].wait(); + } + rwlock_->UNLock(); + return 0; +} + int32_t CommonSparseTable::push_sparse_param(const uint64_t* keys, const float* values, size_t num) { rwlock_->RDLock(); diff --git a/paddle/fluid/distributed/table/common_sparse_table.h b/paddle/fluid/distributed/table/common_sparse_table.h index 31f4dabcdfdd7..50c295da53464 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.h +++ b/paddle/fluid/distributed/table/common_sparse_table.h @@ -63,9 +63,15 @@ class CommonSparseTable : public SparseTable { virtual std::pair print_table_stat(); virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value); + virtual int32_t pull_sparse_ptr(char** pull_values, const uint64_t* keys, + size_t num); + virtual int32_t push_sparse(const uint64_t* keys, const float* values, size_t num); + virtual int32_t push_sparse(const uint64_t* keys, const float** values, + size_t num); + // only for sparse geo table virtual int32_t push_sparse_param(const uint64_t* keys, const float* values, size_t num); @@ -80,6 +86,8 @@ class CommonSparseTable : public SparseTable { protected: virtual int32_t _push_sparse(const uint64_t* keys, const float* values, size_t num); + virtual int32_t _push_sparse(const uint64_t* keys, const float** values, + size_t num); private: const int task_pool_size_ = 11; diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h index cb077033cad42..bb4174bd2c579 100644 --- a/paddle/fluid/distributed/table/depends/large_scale_kv.h +++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h @@ -28,6 +28,7 @@ #include "paddle/fluid/distributed/common/utils.h" #include "paddle/fluid/distributed/table/depends/initializers.h" +#include "paddle/fluid/distributed/thirdparty/round_robin.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/rw_lock.h" @@ -54,23 +55,53 @@ struct VALUE { unseen_days_(0), need_save_(false), is_entry_(false) { - data_.resize(length); - memset(data_.data(), 0, sizeof(float) * length); + data_ = new float[length]; + memset(data_, 0, sizeof(float) * length); + } + + VALUE(const VALUE &value) { + length_ = value.length_; + count_ = value.count_; + unseen_days_ = value.unseen_days_; + need_save_ = value.need_save_; + is_entry_ = value.is_entry_; + data_ = new float[length_]; + memcpy(data_, value.data_, sizeof(float) * length_); + } + + VALUE &operator=(const VALUE &value) { + if (this != &value) { + delete[] data_; + length_ = value.length_; + count_ = value.count_; + unseen_days_ = value.unseen_days_; + need_save_ = value.need_save_; + is_entry_ = value.is_entry_; + + data_ = new float[length_]; + memcpy(data_, value.data_, sizeof(float) * length_); + } + return *this; + } + + ~VALUE() { + 
delete[] data_; + data_ = nullptr; } size_t length_; - std::vector data_; int count_; int unseen_days_; // use to check knock-out bool need_save_; // whether need to save bool is_entry_; // whether knock-in + float *data_; }; -inline bool count_entry(std::shared_ptr value, int threshold) { +inline bool count_entry(VALUE *value, int threshold) { return value->count_ >= threshold; } -inline bool probility_entry(std::shared_ptr value, float threshold) { +inline bool probility_entry(VALUE *value, float threshold) { UniformInitializer uniform = UniformInitializer({"uniform", "0", "0", "1"}); return uniform.GetValue() >= threshold; } @@ -87,7 +118,7 @@ class ValueBlock { value_dims_(value_dims), value_offsets_(value_offsets), value_idx_(value_idx) { - for (int x = 0; x < value_dims.size(); ++x) { + for (size_t x = 0; x < value_dims.size(); ++x) { value_length_ += value_dims[x]; } @@ -96,13 +127,15 @@ class ValueBlock { auto slices = string::split_string(entry_attr, ":"); if (slices[0] == "none") { entry_func_ = std::bind(&count_entry, std::placeholders::_1, 0); + threshold_ = 0; } else if (slices[0] == "count_filter_entry") { - int threshold = std::stoi(slices[1]); - entry_func_ = std::bind(&count_entry, std::placeholders::_1, threshold); + threshold_ = std::stoi(slices[1]); + entry_func_ = + std::bind(&count_entry, std::placeholders::_1, threshold_); } else if (slices[0] == "probability_entry") { - float threshold = std::stof(slices[1]); + threshold_ = std::stof(slices[1]); entry_func_ = - std::bind(&probility_entry, std::placeholders::_1, threshold); + std::bind(&probility_entry, std::placeholders::_1, threshold_); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Not supported Entry Type : %s, Only support [CountFilterEntry, " @@ -148,7 +181,7 @@ class ValueBlock { PADDLE_ENFORCE_EQ( value_dims[i], value_dims_[i], platform::errors::InvalidArgument("value dims is not match")); - pts.push_back(values->data_.data() + + pts.push_back(values.data_ + value_offsets_.at(value_idx_.at(value_names[i]))); } return pts; @@ -158,19 +191,35 @@ class ValueBlock { float *Init(const uint64_t &id, const bool with_update = true, const int counter = 1) { if (!Has(id)) { - values_[id] = std::make_shared(value_length_); + values_.emplace(std::make_pair(id, VALUE(value_length_))); } auto &value = values_.at(id); if (with_update) { - AttrUpdate(value, counter); + AttrUpdate(&value, counter); } - return value->data_.data(); + return value.data_; } - void AttrUpdate(std::shared_ptr value, const int counter) { + + VALUE *InitGet(const uint64_t &id, const bool with_update = true, + const int counter = 1) { + if (!Has(id)) { + values_.emplace(std::make_pair(id, VALUE(value_length_))); + } + + auto &value = values_.at(id); + + if (with_update) { + AttrUpdate(&value, counter); + } + + return &value; + } + + void AttrUpdate(VALUE *value, const int counter) { // update state value->unseen_days_ = 0; value->count_ += counter; @@ -179,8 +228,8 @@ class ValueBlock { value->is_entry_ = entry_func_(value); if (value->is_entry_) { // initialize - for (int x = 0; x < value_names_.size(); ++x) { - initializers_[x]->GetValue(value->data_.data() + value_offsets_[x], + for (size_t x = 0; x < value_names_.size(); ++x) { + initializers_[x]->GetValue(value->data_ + value_offsets_[x], value_dims_[x]); } value->need_save_ = true; @@ -195,27 +244,27 @@ class ValueBlock { // dont jude if (has(id)) float *Get(const uint64_t &id) { auto &value = values_.at(id); - return value->data_.data(); + return value.data_; } // for load, to reset 
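With data_ switching from std::vector<float> to an owning raw float*, VALUE has to spell out the copy constructor, copy assignment, and destructor shown above; the compiler-generated versions would alias the buffer and double-free it. A compact restatement of the same ownership pattern for reference; OwnedBuf is illustrative, and the copy-and-swap assignment is an alternative formulation, not what the patch itself does.

```cpp
// Rule-of-three sketch for an owning raw float buffer. new float[n]()
// zero-initializes, mirroring the memset in VALUE's constructor.
#include <cstddef>
#include <cstring>
#include <utility>

struct OwnedBuf {
  explicit OwnedBuf(size_t n) : len(n), data(new float[n]()) {}
  OwnedBuf(const OwnedBuf& o) : len(o.len), data(new float[o.len]) {
    std::memcpy(data, o.data, sizeof(float) * len);
  }
  OwnedBuf& operator=(OwnedBuf o) {  // copy-and-swap; see note above
    std::swap(len, o.len);
    std::swap(data, o.data);
    return *this;
  }
  ~OwnedBuf() { delete[] data; }

  size_t len;
  float* data;
};
```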
count, unseen_days - std::shared_ptr GetValue(const uint64_t &id) { return values_.at(id); } + VALUE *GetValue(const uint64_t &id) { return &values_.at(id); } bool GetEntry(const uint64_t &id) { auto &value = values_.at(id); - return value->is_entry_; + return value.is_entry_; } void SetEntry(const uint64_t &id, const bool state) { auto &value = values_.at(id); - value->is_entry_ = state; + value.is_entry_ = state; } void Shrink(const int threshold) { for (auto iter = values_.begin(); iter != values_.end();) { auto &value = iter->second; - value->unseen_days_++; - if (value->unseen_days_ >= threshold) { + value.unseen_days_++; + if (value.unseen_days_ >= threshold) { iter = values_.erase(iter); } else { ++iter; @@ -224,6 +273,8 @@ class ValueBlock { return; } + float GetThreshold() { return threshold_; } + private: bool Has(const uint64_t id) { auto got = values_.find(id); @@ -235,7 +286,7 @@ class ValueBlock { } public: - std::unordered_map> values_; + robin_hood::unordered_map values_; size_t value_length_ = 0; private: @@ -244,9 +295,11 @@ class ValueBlock { const std::vector &value_offsets_; const std::unordered_map &value_idx_; - std::function)> entry_func_; + std::function entry_func_; std::vector> initializers_; + float threshold_; }; } // namespace distributed } // namespace paddle + diff --git a/paddle/fluid/distributed/table/table.h b/paddle/fluid/distributed/table/table.h index 5bc818ff4741f..81a1ff5eced2b 100644 --- a/paddle/fluid/distributed/table/table.h +++ b/paddle/fluid/distributed/table/table.h @@ -48,10 +48,17 @@ class Table { return 0; } + virtual int32_t pull_sparse_ptr(char **pull_values, const uint64_t *keys, + size_t num) { + VLOG(0) << "NOT IMPLEMENT"; + return 0; + } virtual int32_t pull_sparse(float *values, const PullSparseValue &pull_value) = 0; virtual int32_t push_sparse(const uint64_t *keys, const float *values, size_t num) = 0; + virtual int32_t push_sparse(const uint64_t *keys, const float **values, + size_t num){}; virtual int32_t push_sparse_param(const uint64_t *keys, const float *values, size_t num) { return 0; diff --git a/paddle/fluid/distributed/thirdparty/round_robin.h b/paddle/fluid/distributed/thirdparty/round_robin.h new file mode 100644 index 0000000000000..f5075b4545af0 --- /dev/null +++ b/paddle/fluid/distributed/thirdparty/round_robin.h @@ -0,0 +1,2685 @@ +// ______ _____ ______ _________ +// ______________ ___ /_ ___(_)_______ ___ /_ ______ ______ ______ / +// __ ___/_ __ \__ __ \__ / __ __ \ __ __ \_ __ \_ __ \_ __ / +// _ / / /_/ /_ /_/ /_ / _ / / / _ / / // /_/ // /_/ // /_/ / +// /_/ \____/ /_.___/ /_/ /_/ /_/ ________/_/ /_/ \____/ \____/ \__,_/ +// _/_____/ +// +// Fast & memory efficient hashtable based on robin hood hashing for +// C++11/14/17/20 +// https://github.com/martinus/robin-hood-hashing +// +// Licensed under the MIT License . +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2021 Martin Ankerl +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all +// copies or substantial portions of the Software. 
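The values_ container above moves from a std::unordered_map keyed to std::shared_ptr<VALUE> to the vendored robin_hood::unordered_map holding VALUE inline, removing one heap allocation per slot. A small usage sketch against the vendored header; Slot stands in for VALUE, and pointers obtained this way are only stable until the next rehash or erase.

```cpp
// Inline-value lookup sketch using the vendored robin-hood map.
#include <cstdint>

#include "paddle/fluid/distributed/thirdparty/round_robin.h"

struct Slot {
  int count_ = 0;
  bool need_save_ = false;
};

robin_hood::unordered_map<uint64_t, Slot> values_;

Slot* init_get(uint64_t id) {
  auto it = values_.find(id);
  if (it == values_.end()) {
    it = values_.emplace(id, Slot{}).first;  // construct in place on a miss
  }
  return &it->second;  // valid until the next rehash/erase
}
```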
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#ifndef ROBIN_HOOD_H_INCLUDED +#define ROBIN_HOOD_H_INCLUDED + +// see https://semver.org/ +#define ROBIN_HOOD_VERSION_MAJOR 3 // for incompatible API changes +#define ROBIN_HOOD_VERSION_MINOR \ + 11 // for adding functionality in a backwards-compatible manner +#define ROBIN_HOOD_VERSION_PATCH 1 // for backwards-compatible bug fixes + +#include +#include +#include +#include +#include // only to support hash of smart pointers +#include +#include +#include +#include +#if __cplusplus >= 201703L +#include +#endif + +// #define ROBIN_HOOD_LOG_ENABLED +#ifdef ROBIN_HOOD_LOG_ENABLED +#include +#define ROBIN_HOOD_LOG(...) \ + std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << __VA_ARGS__ \ + << std::endl; +#else +#define ROBIN_HOOD_LOG(x) +#endif + +// #define ROBIN_HOOD_TRACE_ENABLED +#ifdef ROBIN_HOOD_TRACE_ENABLED +#include +#define ROBIN_HOOD_TRACE(...) \ + std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << __VA_ARGS__ \ + << std::endl; +#else +#define ROBIN_HOOD_TRACE(x) +#endif + +// #define ROBIN_HOOD_COUNT_ENABLED +#ifdef ROBIN_HOOD_COUNT_ENABLED +#include +#define ROBIN_HOOD_COUNT(x) ++counts().x; +namespace robin_hood { +struct Counts { + uint64_t shiftUp{}; + uint64_t shiftDown{}; +}; +inline std::ostream &operator<<(std::ostream &os, Counts const &c) { + return os << c.shiftUp << " shiftUp" << std::endl + << c.shiftDown << " shiftDown" << std::endl; +} + +static Counts &counts() { + static Counts counts{}; + return counts; +} +} // namespace robin_hood +#else +#define ROBIN_HOOD_COUNT(x) +#endif + +// all non-argument macros should use this facility. 
See +// https://www.fluentcpp.com/2019/05/28/better-macros-better-flags/ +#define ROBIN_HOOD(x) ROBIN_HOOD_PRIVATE_DEFINITION_##x() + +// mark unused members with this macro +#define ROBIN_HOOD_UNUSED(identifier) + +// bitness +#if SIZE_MAX == UINT32_MAX +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITNESS() 32 +#elif SIZE_MAX == UINT64_MAX +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITNESS() 64 +#else +#error Unsupported bitness +#endif + +// endianess +#ifdef _MSC_VER +#define ROBIN_HOOD_PRIVATE_DEFINITION_LITTLE_ENDIAN() 1 +#define ROBIN_HOOD_PRIVATE_DEFINITION_BIG_ENDIAN() 0 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_LITTLE_ENDIAN() \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define ROBIN_HOOD_PRIVATE_DEFINITION_BIG_ENDIAN() \ + (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#endif + +// inline +#ifdef _MSC_VER +#define ROBIN_HOOD_PRIVATE_DEFINITION_NOINLINE() __declspec(noinline) +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_NOINLINE() __attribute__((noinline)) +#endif + +// exceptions +#if !defined(__cpp_exceptions) && !defined(__EXCEPTIONS) && !defined(_CPPUNWIND) +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_EXCEPTIONS() 0 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_EXCEPTIONS() 1 +#endif + +// count leading/trailing bits +#if !defined(ROBIN_HOOD_DISABLE_INTRINSICS) +#ifdef _MSC_VER +#if ROBIN_HOOD(BITNESS) == 32 +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward64 +#endif +#include +#pragma intrinsic(ROBIN_HOOD(BITSCANFORWARD)) +#define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x) \ + [](size_t mask) noexcept->int { \ + unsigned long index; \ + return ROBIN_HOOD(BITSCANFORWARD)(&index, mask) ? static_cast(index) \ + : ROBIN_HOOD(BITNESS); \ + } \ + (x) +#else +#if ROBIN_HOOD(BITNESS) == 32 +#define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzl +#define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzl +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzll +#define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzll +#endif +#define ROBIN_HOOD_COUNT_LEADING_ZEROES(x) \ + ((x) ? ROBIN_HOOD(CLZ)(x) : ROBIN_HOOD(BITNESS)) +#define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x) \ + ((x) ? ROBIN_HOOD(CTZ)(x) : ROBIN_HOOD(BITNESS)) +#endif +#endif + +// fallthrough +#ifndef __has_cpp_attribute // For backwards compatibility +#define __has_cpp_attribute(x) 0 +#endif +#if __has_cpp_attribute(clang::fallthrough) +#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() [[clang::fallthrough]] +#elif __has_cpp_attribute(gnu::fallthrough) +#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() [[gnu::fallthrough]] +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() +#endif + +// likely/unlikely +#ifdef _MSC_VER +#define ROBIN_HOOD_LIKELY(condition) condition +#define ROBIN_HOOD_UNLIKELY(condition) condition +#else +#define ROBIN_HOOD_LIKELY(condition) __builtin_expect(condition, 1) +#define ROBIN_HOOD_UNLIKELY(condition) __builtin_expect(condition, 0) +#endif + +// detect if native wchar_t type is availiable in MSVC +#ifdef _MSC_VER +#ifdef _NATIVE_WCHAR_T_DEFINED +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 0 +#endif +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1 +#endif + +// detect if MSVC supports the pair(std::piecewise_construct_t,...) 
consructor +// being constexpr +#ifdef _MSC_VER +#if _MSC_VER <= 1900 +#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 1 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 0 +#endif +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 0 +#endif + +// workaround missing "is_trivially_copyable" in g++ < 5.0 +// See https://stackoverflow.com/a/31798726/48181 +#if defined(__GNUC__) && __GNUC__ < 5 +#define ROBIN_HOOD_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__) +#else +#define ROBIN_HOOD_IS_TRIVIALLY_COPYABLE(...) \ + std::is_trivially_copyable<__VA_ARGS__>::value +#endif + +// helpers for C++ versions, see +// https://gcc.gnu.org/onlinedocs/cpp/Standard-Predefined-Macros.html +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX() __cplusplus +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX98() 199711L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX11() 201103L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX14() 201402L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX17() 201703L + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX17) +#define ROBIN_HOOD_PRIVATE_DEFINITION_NODISCARD() [[nodiscard]] +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_NODISCARD() +#endif + +namespace robin_hood { + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX14) +#define ROBIN_HOOD_STD std +#else + +// c++11 compatibility layer +namespace ROBIN_HOOD_STD { +template +struct alignment_of + : std::integral_constant< + std::size_t, alignof(typename std::remove_all_extents::type)> {}; + +template +class integer_sequence { + public: + using value_type = T; + static_assert(std::is_integral::value, "not integral type"); + static constexpr std::size_t size() noexcept { return sizeof...(Ints); } +}; +template +using index_sequence = integer_sequence; + +namespace detail_ { +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0 && Begin < End, + "unexpected argument (Begin<0 || Begin<=End)"); + + template + struct IntSeqCombiner; + + template + struct IntSeqCombiner, + integer_sequence> { + using TResult = integer_sequence; + }; + + using TResult = typename IntSeqCombiner< + typename IntSeqImpl::TResult, + typename IntSeqImpl::TResult>::TResult; +}; + +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0, "unexpected argument (Begin<0)"); + using TResult = integer_sequence; +}; + +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0, "unexpected argument (Begin<0)"); + using TResult = integer_sequence; +}; +} // namespace detail_ + +template +using make_integer_sequence = + typename detail_::IntSeqImpl::TResult; + +template +using make_index_sequence = make_integer_sequence; + +template +using index_sequence_for = make_index_sequence; + +} // namespace ROBIN_HOOD_STD + +#endif + +namespace detail { + +// make sure we static_cast to the correct type for hash_int +#if ROBIN_HOOD(BITNESS) == 64 +using SizeT = uint64_t; +#else +using SizeT = uint32_t; +#endif + +template +T rotr(T x, unsigned k) { + return (x >> k) | (x << (8U * sizeof(T) - k)); +} + +// This cast gets rid of warnings like "cast from 'uint8_t*' {aka 'unsigned +// char*'} to +// 'uint64_t*' {aka 'long unsigned int*'} increases required alignment of target +// type". Use with +// care! 
+template +inline T reinterpret_cast_no_cast_align_warning(void *ptr) noexcept { + return reinterpret_cast(ptr); +} + +template +inline T reinterpret_cast_no_cast_align_warning(void const *ptr) noexcept { + return reinterpret_cast(ptr); +} + +// make sure this is not inlined as it is slow and dramatically enlarges code, +// thus making other +// inlinings more difficult. Throws are also generally the slow path. +template +[[noreturn]] ROBIN_HOOD(NOINLINE) +#if ROBIN_HOOD(HAS_EXCEPTIONS) + void doThrow(Args &&... args) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay) + throw E(std::forward(args)...); +} +#else + void doThrow(Args &&... ROBIN_HOOD_UNUSED(args) /*unused*/) { + abort(); +} +#endif + +template +T *assertNotNull(T *t, Args &&... args) { + if (ROBIN_HOOD_UNLIKELY(nullptr == t)) { + doThrow(std::forward(args)...); + } + return t; +} + +template +inline T unaligned_load(void const *ptr) noexcept { + // using memcpy so we don't get into unaligned load problems. + // compiler should optimize this very well anyways. + T t; + std::memcpy(&t, ptr, sizeof(T)); + return t; +} + +// Allocates bulks of memory for objects of type T. This deallocates the memory +// in the destructor, +// and keeps a linked list of the allocated memory around. Overhead per +// allocation is the size of a +// pointer. +template +class BulkPoolAllocator { + public: + BulkPoolAllocator() noexcept = default; + + // does not copy anything, just creates a new allocator. + BulkPoolAllocator(const BulkPoolAllocator &ROBIN_HOOD_UNUSED( + o) /*unused*/) noexcept : mHead(nullptr), + mListForFree(nullptr) {} + + BulkPoolAllocator(BulkPoolAllocator &&o) noexcept + : mHead(o.mHead), + mListForFree(o.mListForFree) { + o.mListForFree = nullptr; + o.mHead = nullptr; + } + + BulkPoolAllocator &operator=(BulkPoolAllocator &&o) noexcept { + reset(); + mHead = o.mHead; + mListForFree = o.mListForFree; + o.mListForFree = nullptr; + o.mHead = nullptr; + return *this; + } + + BulkPoolAllocator & + // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) + operator=(const BulkPoolAllocator &ROBIN_HOOD_UNUSED(o) /*unused*/) noexcept { + // does not do anything + return *this; + } + + ~BulkPoolAllocator() noexcept { reset(); } + + // Deallocates all allocated memory. + void reset() noexcept { + while (mListForFree) { + T *tmp = *mListForFree; + ROBIN_HOOD_LOG("std::free") + std::free(mListForFree); + mListForFree = reinterpret_cast_no_cast_align_warning(tmp); + } + mHead = nullptr; + } + + // allocates, but does NOT initialize. Use in-place new constructor, e.g. + // T* obj = pool.allocate(); + // ::new (static_cast(obj)) T(); + T *allocate() { + T *tmp = mHead; + if (!tmp) { + tmp = performAllocation(); + } + + mHead = *reinterpret_cast_no_cast_align_warning(tmp); + return tmp; + } + + // does not actually deallocate but puts it in store. + // make sure you have already called the destructor! e.g. with + // obj->~T(); + // pool.deallocate(obj); + void deallocate(T *obj) noexcept { + *reinterpret_cast_no_cast_align_warning(obj) = mHead; + mHead = obj; + } + + // Adds an already allocated block of memory to the allocator. This allocator + // is from now on + // responsible for freeing the data (with free()). If the provided data is not + // large enough to + // make use of, it is immediately freed. Otherwise it is reused and freed in + // the destructor. 
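unaligned_load above is the standard memcpy-based way to read a word from a possibly misaligned address without undefined behavior; compilers collapse it to a single load on targets that allow it. Restated minimally for reference, generic rather than specific to this header.

```cpp
// memcpy-based unaligned read: well-defined for any address, and
// optimized down to a plain load on architectures that permit it.
#include <cstring>

template <typename T>
T load_unaligned(const void* ptr) {
  T value;
  std::memcpy(&value, ptr, sizeof(T));
  return value;
}
```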
+ void addOrFree(void *ptr, const size_t numBytes) noexcept { + // calculate number of available elements in ptr + if (numBytes < ALIGNMENT + ALIGNED_SIZE) { + // not enough data for at least one element. Free and return. + ROBIN_HOOD_LOG("std::free") + std::free(ptr); + } else { + ROBIN_HOOD_LOG("add to buffer") + add(ptr, numBytes); + } + } + + void swap(BulkPoolAllocator &other) noexcept { + using std::swap; + swap(mHead, other.mHead); + swap(mListForFree, other.mListForFree); + } + + private: + // iterates the list of allocated memory to calculate how many to alloc next. + // Recalculating this each time saves us a size_t member. + // This ignores the fact that memory blocks might have been added manually + // with addOrFree. In + // practice, this should not matter much. + ROBIN_HOOD(NODISCARD) size_t calcNumElementsToAlloc() const noexcept { + auto tmp = mListForFree; + size_t numAllocs = MinNumAllocs; + + while (numAllocs * 2 <= MaxNumAllocs && tmp) { + auto x = reinterpret_cast(tmp); + tmp = *x; + numAllocs *= 2; + } + + return numAllocs; + } + + // WARNING: Underflow if numBytes < ALIGNMENT! This is guarded in addOrFree(). + void add(void *ptr, const size_t numBytes) noexcept { + const size_t numElements = (numBytes - ALIGNMENT) / ALIGNED_SIZE; + + auto data = reinterpret_cast(ptr); + + // link free list + auto x = reinterpret_cast(data); + *x = mListForFree; + mListForFree = data; + + // create linked list for newly allocated data + auto *const headT = reinterpret_cast_no_cast_align_warning( + reinterpret_cast(ptr) + ALIGNMENT); + + auto *const head = reinterpret_cast(headT); + + // Visual Studio compiler automatically unrolls this loop, which is pretty + // cool + for (size_t i = 0; i < numElements; ++i) { + *reinterpret_cast_no_cast_align_warning( + head + i * ALIGNED_SIZE) = head + (i + 1) * ALIGNED_SIZE; + } + + // last one points to 0 + *reinterpret_cast_no_cast_align_warning( + head + (numElements - 1) * ALIGNED_SIZE) = mHead; + mHead = headT; + } + + // Called when no memory is available (mHead == 0). + // Don't inline this slow path. + ROBIN_HOOD(NOINLINE) T *performAllocation() { + size_t const numElementsToAlloc = calcNumElementsToAlloc(); + + // alloc new memory: [prev |T, T, ... T] + size_t const bytes = ALIGNMENT + ALIGNED_SIZE * numElementsToAlloc; + ROBIN_HOOD_LOG("std::malloc " << bytes << " = " << ALIGNMENT << " + " + << ALIGNED_SIZE << " * " + << numElementsToAlloc) + add(assertNotNull(std::malloc(bytes)), bytes); + return mHead; + } + +// enforce byte alignment of the T's +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX14) + static constexpr size_t ALIGNMENT = + (std::max)(std::alignment_of::value, std::alignment_of::value); +#else + static const size_t ALIGNMENT = + (ROBIN_HOOD_STD::alignment_of::value > + ROBIN_HOOD_STD::alignment_of::value) + ? ROBIN_HOOD_STD::alignment_of::value + : +ROBIN_HOOD_STD::alignment_of::value; // the + is for + // walkarround +#endif + + static constexpr size_t ALIGNED_SIZE = + ((sizeof(T) - 1) / ALIGNMENT + 1) * ALIGNMENT; + + static_assert(MinNumAllocs >= 1, "MinNumAllocs"); + static_assert(MaxNumAllocs >= MinNumAllocs, "MaxNumAllocs"); + static_assert(ALIGNED_SIZE >= sizeof(T *), "ALIGNED_SIZE"); + static_assert(0 == (ALIGNED_SIZE % sizeof(T *)), "ALIGNED_SIZE mod"); + static_assert(ALIGNMENT >= sizeof(T *), "ALIGNMENT"); + + T *mHead{nullptr}; + T **mListForFree{nullptr}; +}; + +template +struct NodeAllocator; + +// dummy allocator that does nothing +template +struct NodeAllocator { + // we are not using the data, so just free it. 
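BulkPoolAllocator above keeps freed nodes on an intrusive free list: the link to the next free node is written into the freed node's own bytes, so allocate/deallocate cost a pointer swap and no extra bookkeeping memory. A stripped-down sketch of just that idea; it never releases memory and relies on the static_assert shown, both of which the real allocator handles more carefully.

```cpp
// Intrusive free-list sketch: reuse a freed node's own storage to hold
// the link to the next free node. Simplification: blocks obtained from
// malloc are never freed here, unlike BulkPoolAllocator::reset().
#include <cstdlib>

template <typename T>
struct TinyPool {
  static_assert(sizeof(T) >= sizeof(T*), "node must be able to hold a link");

  T* free_head = nullptr;

  T* allocate() {
    if (free_head == nullptr) {
      return static_cast<T*>(std::malloc(sizeof(T)));
    }
    T* node = free_head;
    free_head = *reinterpret_cast<T**>(node);  // pop the free list
    return node;
  }

  void deallocate(T* node) {
    *reinterpret_cast<T**>(node) = free_head;  // push onto the free list
    free_head = node;
  }
};
```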
+ void addOrFree(void *ptr, + size_t ROBIN_HOOD_UNUSED(numBytes) /*unused*/) noexcept { + ROBIN_HOOD_LOG("std::free") + std::free(ptr); + } +}; + +template +struct NodeAllocator + : public BulkPoolAllocator {}; + +// c++14 doesn't have is_nothrow_swappable, and clang++ 6.0.1 doesn't like it +// either, so I'm making +// my own here. +namespace swappable { +#if ROBIN_HOOD(CXX) < ROBIN_HOOD(CXX17) +using std::swap; +template +struct nothrow { + static const bool value = + noexcept(swap(std::declval(), std::declval())); +}; +#else +template +struct nothrow { + static const bool value = std::is_nothrow_swappable::value; +}; +#endif +} // namespace swappable + +} // namespace detail + +struct is_transparent_tag {}; + +// A custom pair implementation is used in the map because std::pair is not +// is_trivially_copyable, +// which means it would not be allowed to be used in std::memcpy. This struct +// is copyable, which is +// also tested. +template +struct pair { + using first_type = T1; + using second_type = T2; + + template ::value && + std::is_default_constructible::value>::type> + constexpr pair() noexcept(noexcept(U1()) && noexcept(U2())) + : first(), second() {} + + // pair constructors are explicit so we don't accidentally call this ctor when + // we don't have to. + explicit constexpr pair(std::pair const &o) noexcept( + noexcept(T1(std::declval())) && + noexcept(T2(std::declval()))) + : first(o.first), second(o.second) {} + + // pair constructors are explicit so we don't accidentally call this ctor when + // we don't have to. + explicit constexpr pair(std::pair &&o) noexcept( + noexcept(T1(std::move(std::declval()))) && + noexcept(T2(std::move(std::declval())))) + : first(std::move(o.first)), second(std::move(o.second)) {} + + constexpr pair(T1 &&a, T2 &&b) noexcept( + noexcept(T1(std::move(std::declval()))) && + noexcept(T2(std::move(std::declval())))) + : first(std::move(a)), second(std::move(b)) {} + + template + constexpr pair(U1 &&a, U2 &&b) noexcept( + noexcept(T1(std::forward(std::declval()))) && + noexcept(T2(std::forward(std::declval())))) + : first(std::forward(a)), second(std::forward(b)) {} + + template +// MSVC 2015 produces error "C2476: ‘constexpr’ constructor does not initialize +// all members" +// if this constructor is constexpr +#if !ROBIN_HOOD(BROKEN_CONSTEXPR) + constexpr +#endif + pair(std::piecewise_construct_t /*unused*/, std::tuple a, + std::tuple + b) noexcept(noexcept(pair(std::declval &>(), + std::declval &>(), + ROBIN_HOOD_STD::index_sequence_for< + U1...>(), + ROBIN_HOOD_STD::index_sequence_for< + U2...>()))) + : pair(a, b, ROBIN_HOOD_STD::index_sequence_for(), + ROBIN_HOOD_STD::index_sequence_for()) { + } + + // constructor called from the std::piecewise_construct_t ctor + template + pair( + std::tuple &a, std::tuple &b, + ROBIN_HOOD_STD::index_sequence /*unused*/, + ROBIN_HOOD_STD::index_sequence< + I2...> /*unused*/) noexcept(noexcept(T1(std:: + forward(std::get( + std::declval< + std::tuple + &>()))...)) && + noexcept(T2(std::forward(std::get( + std::declval< + std::tuple &>()))...))) + : first(std::forward(std::get(a))...), + second(std::forward(std::get(b))...) { + // make visual studio compiler happy about warning about unused a & b. + // Visual studio's pair implementation disables warning 4100. 
+ (void)a; + (void)b; + } + + void swap(pair &o) noexcept((detail::swappable::nothrow::value) && + (detail::swappable::nothrow::value)) { + using std::swap; + swap(first, o.first); + swap(second, o.second); + } + + T1 first; // NOLINT(misc-non-private-member-variables-in-classes) + T2 second; // NOLINT(misc-non-private-member-variables-in-classes) +}; + +template +inline void swap(pair &a, pair &b) noexcept( + noexcept(std::declval &>().swap(std::declval &>()))) { + a.swap(b); +} + +template +inline constexpr bool operator==(pair const &x, pair const &y) { + return (x.first == y.first) && (x.second == y.second); +} +template +inline constexpr bool operator!=(pair const &x, pair const &y) { + return !(x == y); +} +template +inline constexpr bool +operator<(pair const &x, pair const &y) noexcept( + noexcept(std::declval() < std::declval()) && + noexcept(std::declval() < std::declval())) { + return x.first < y.first || (!(y.first < x.first) && x.second < y.second); +} +template +inline constexpr bool operator>(pair const &x, pair const &y) { + return y < x; +} +template +inline constexpr bool operator<=(pair const &x, pair const &y) { + return !(x > y); +} +template +inline constexpr bool operator>=(pair const &x, pair const &y) { + return !(x < y); +} + +inline size_t hash_bytes(void const *ptr, size_t len) noexcept { + static constexpr uint64_t m = UINT64_C(0xc6a4a7935bd1e995); + static constexpr uint64_t seed = UINT64_C(0xe17a1465); + static constexpr unsigned int r = 47; + + auto const *const data64 = static_cast(ptr); + uint64_t h = seed ^ (len * m); + + size_t const n_blocks = len / 8; + for (size_t i = 0; i < n_blocks; ++i) { + auto k = detail::unaligned_load(data64 + i); + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + auto const *const data8 = + reinterpret_cast(data64 + n_blocks); + switch (len & 7U) { + case 7: + h ^= static_cast(data8[6]) << 48U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 6: + h ^= static_cast(data8[5]) << 40U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 5: + h ^= static_cast(data8[4]) << 32U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 4: + h ^= static_cast(data8[3]) << 24U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 3: + h ^= static_cast(data8[2]) << 16U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 2: + h ^= static_cast(data8[1]) << 8U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 1: + h ^= static_cast(data8[0]); + h *= m; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + default: + break; + } + + h ^= h >> r; + + // not doing the final step here, because this will be done by keyToIdx + // anyways + // h *= m; + // h ^= h >> r; + return static_cast(h); +} + +inline size_t hash_int(uint64_t x) noexcept { + // tried lots of different hashes, let's stick with murmurhash3. It's simple, + // fast, well tested, + // and doesn't need any special 128bit operations. + x ^= x >> 33U; + x *= UINT64_C(0xff51afd7ed558ccd); + x ^= x >> 33U; + + // not doing the final step here, because this will be done by keyToIdx + // anyways + // x *= UINT64_C(0xc4ceb9fe1a85ec53); + // x ^= x >> 33U; + return static_cast(x); +} + +// A thin wrapper around std::hash, performing an additional simple mixing step +// of the result. 
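hash_bytes and hash_int above are the murmur-style mixers the map builds on; robin_hood::hash<T> additionally runs std::hash's result through the integer mixer so sequential keys do not land in sequential buckets. The mixing step on its own, for reference; the remaining multiply-and-shift is deliberately deferred to keyToIdx, as the comments above note.

```cpp
// Integer mixing step used by robin_hood::hash (murmur3-style finalizer
// without the last multiply, which keyToIdx applies later).
#include <cstddef>
#include <cstdint>

inline size_t mix_key(uint64_t x) {
  x ^= x >> 33U;
  x *= UINT64_C(0xff51afd7ed558ccd);
  x ^= x >> 33U;
  return static_cast<size_t>(x);
}
```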
+template +struct hash : public std::hash { + size_t operator()(T const &obj) const noexcept(noexcept( + std::declval>().operator()(std::declval()))) { + // call base hash + auto result = std::hash::operator()(obj); + // return mixed of that, to be save against identity has + return hash_int(static_cast(result)); + } +}; + +template +struct hash> { + size_t operator()(std::basic_string const &str) const noexcept { + return hash_bytes(str.data(), sizeof(CharT) * str.size()); + } +}; + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX17) +template +struct hash> { + size_t operator()(std::basic_string_view const &sv) const noexcept { + return hash_bytes(sv.data(), sizeof(CharT) * sv.size()); + } +}; +#endif + +template +struct hash { + size_t operator()(T *ptr) const noexcept { + return hash_int(reinterpret_cast(ptr)); + } +}; + +template +struct hash> { + size_t operator()(std::unique_ptr const &ptr) const noexcept { + return hash_int(reinterpret_cast(ptr.get())); + } +}; + +template +struct hash> { + size_t operator()(std::shared_ptr const &ptr) const noexcept { + return hash_int(reinterpret_cast(ptr.get())); + } +}; + +template +struct hash::value>::type> { + size_t operator()(Enum e) const noexcept { + using Underlying = typename std::underlying_type::type; + return hash{}(static_cast(e)); + } +}; + +#define ROBIN_HOOD_HASH_INT(T) \ + template <> \ + struct hash { \ + size_t operator()(T const &obj) const noexcept { \ + return hash_int(static_cast(obj)); \ + } \ + } + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuseless-cast" +#endif +// see https://en.cppreference.com/w/cpp/utility/hash +ROBIN_HOOD_HASH_INT(bool); +ROBIN_HOOD_HASH_INT(char); +ROBIN_HOOD_HASH_INT(signed char); +ROBIN_HOOD_HASH_INT(unsigned char); +ROBIN_HOOD_HASH_INT(char16_t); +ROBIN_HOOD_HASH_INT(char32_t); +#if ROBIN_HOOD(HAS_NATIVE_WCHART) +ROBIN_HOOD_HASH_INT(wchar_t); +#endif +ROBIN_HOOD_HASH_INT(short); +ROBIN_HOOD_HASH_INT(unsigned short); +ROBIN_HOOD_HASH_INT(int); +ROBIN_HOOD_HASH_INT(unsigned int); +ROBIN_HOOD_HASH_INT(long); +ROBIN_HOOD_HASH_INT(long long); +ROBIN_HOOD_HASH_INT(unsigned long); +ROBIN_HOOD_HASH_INT(unsigned long long); +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif +namespace detail { + +template +struct void_type { + using type = void; +}; + +template +struct has_is_transparent : public std::false_type {}; + +template +struct has_is_transparent::type> + : public std::true_type {}; + +// using wrapper classes for hash and key_equal prevents the diamond problem +// when the same type +// is used. see https://stackoverflow.com/a/28771920/48181 +template +struct WrapHash : public T { + WrapHash() = default; + explicit WrapHash(T const &o) noexcept(noexcept(T(std::declval()))) + : T(o) {} +}; + +template +struct WrapKeyEqual : public T { + WrapKeyEqual() = default; + explicit WrapKeyEqual(T const &o) noexcept( + noexcept(T(std::declval()))) + : T(o) {} +}; + +// A highly optimized hashmap implementation, using the Robin Hood algorithm. +// +// In most cases, this map should be usable as a drop-in replacement for +// std::unordered_map, but +// be about 2x faster in most cases and require much less allocations. +// +// This implementation uses the following memory layout: +// +// [Node, Node, ... Node | info, info, ... infoSentinel ] +// +// * Node: either a DataNode that directly has the std::pair as +// member, +// or a DataNode with a pointer to std::pair. 
Which DataNode +// representation to use +// depends on how fast the swap() operation is. Heuristically, this is +// automatically choosen +// based on sizeof(). there are always 2^n Nodes. +// +// * info: Each Node in the map has a corresponding info byte, so there are 2^n +// info bytes. +// Each byte is initialized to 0, meaning the corresponding Node is empty. Set +// to 1 means the +// corresponding node contains data. Set to 2 means the corresponding Node is +// filled, but it +// actually belongs to the previous position and was pushed out because that +// place is already +// taken. +// +// * infoSentinel: Sentinel byte set to 1, so that iterator's ++ can stop at +// end() without the +// need for a idx variable. +// +// According to STL, order of templates has effect on throughput. That's why +// I've moved the +// boolean to the front. +// https://www.reddit.com/r/cpp/comments/ahp6iu/compile_time_binary_size_reductions_and_cs_future/eeguck4/ +template +class Table + : public WrapHash, + public WrapKeyEqual, + detail::NodeAllocator< + typename std::conditional< + std::is_void::value, Key, + robin_hood::pair< + typename std::conditional::type, + T>>::type, + 4, 16384, IsFlat> { + public: + static constexpr bool is_flat = IsFlat; + static constexpr bool is_map = !std::is_void::value; + static constexpr bool is_set = !is_map; + static constexpr bool is_transparent = + has_is_transparent::value && has_is_transparent::value; + + using key_type = Key; + using mapped_type = T; + using value_type = typename std::conditional< + is_set, Key, + robin_hood::pair::type, + T>>::type; + using size_type = size_t; + using hasher = Hash; + using key_equal = KeyEqual; + using Self = + Table; + + private: + static_assert(MaxLoadFactor100 > 10 && MaxLoadFactor100 < 100, + "MaxLoadFactor100 needs to be >10 && < 100"); + + using WHash = WrapHash; + using WKeyEqual = WrapKeyEqual; + + // configuration defaults + + // make sure we have 8 elements, needed to quickly rehash mInfo + static constexpr size_t InitialNumElements = sizeof(uint64_t); + static constexpr uint32_t InitialInfoNumBits = 5; + static constexpr uint8_t InitialInfoInc = 1U << InitialInfoNumBits; + static constexpr size_t InfoMask = InitialInfoInc - 1U; + static constexpr uint8_t InitialInfoHashShift = 0; + using DataPool = detail::NodeAllocator; + + // type needs to be wider than uint8_t. + using InfoType = uint32_t; + + // DataNode //////////////////////////////////////////////////////// + + // Primary template for the data node. We have special implementations for + // small and big + // objects. For large objects it is assumed that swap() is fairly slow, so we + // allocate these + // on the heap so swap merely swaps a pointer. + template + class DataNode {}; + + // Small: just allocate on the stack. + template + class DataNode final { + public: + template + explicit DataNode( + M &ROBIN_HOOD_UNUSED(map) /*unused*/, + Args &&... args) noexcept(noexcept(value_type(std:: + forward( + args)...))) + : mData(std::forward(args)...) 
{} + + DataNode( + M &ROBIN_HOOD_UNUSED(map) /*unused*/, + DataNode + &&n) noexcept(std::is_nothrow_move_constructible::value) + : mData(std::move(n.mData)) {} + + // doesn't do anything + void destroy(M &ROBIN_HOOD_UNUSED(map) /*unused*/) noexcept {} + void destroyDoNotDeallocate() noexcept {} + + value_type const *operator->() const noexcept { return &mData; } + value_type *operator->() noexcept { return &mData; } + + const value_type &operator*() const noexcept { return mData; } + + value_type &operator*() noexcept { return mData; } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() noexcept { + return mData.first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() noexcept { + return mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() const noexcept { + return mData.first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() const + noexcept { + return mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() noexcept { + return mData.second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() const + noexcept { + return mData.second; + } + + void swap(DataNode &o) noexcept( + noexcept(std::declval().swap(std::declval()))) { + mData.swap(o.mData); + } + + private: + value_type mData; + }; + + // big object: allocate on heap. + template + class DataNode { + public: + template + explicit DataNode(M &map, Args &&... args) : mData(map.allocate()) { + ::new (static_cast(mData)) + value_type(std::forward(args)...); + } + + DataNode(M &ROBIN_HOOD_UNUSED(map) /*unused*/, + DataNode &&n) noexcept : mData(std::move(n.mData)) {} + + void destroy(M &map) noexcept { + // don't deallocate, just put it into list of datapool. + mData->~value_type(); + map.deallocate(mData); + } + + void destroyDoNotDeallocate() noexcept { mData->~value_type(); } + + value_type const *operator->() const noexcept { return mData; } + + value_type *operator->() noexcept { return mData; } + + const value_type &operator*() const { return *mData; } + + value_type &operator*() { return *mData; } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() noexcept { + return mData->first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() noexcept { + return *mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() const noexcept { + return mData->first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() const + noexcept { + return *mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() noexcept { + return mData->second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() const + noexcept { + return mData->second; + } + + void swap(DataNode &o) noexcept { + using std::swap; + swap(mData, o.mData); + } + + private: + value_type *mData; + }; + + using Node = DataNode; + + // helpers for insertKeyPrepareEmptySpot: extract first entry (only const + // required) + ROBIN_HOOD(NODISCARD) + key_type const &getFirstConst(Node const &n) const noexcept { + return n.getFirst(); + } + + // in case we have void mapped_type, we are not using a pair, thus we just + // route k through. + // No need to disable this because it's just not used if not applicable. 
+ ROBIN_HOOD(NODISCARD) + key_type const &getFirstConst(key_type const &k) const noexcept { return k; } + + // in case we have non-void mapped_type, we have a standard robin_hood::pair + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, key_type const &>::type + getFirstConst(value_type const &vt) const noexcept { + return vt.first; + } + + // Cloner ////////////////////////////////////////////////////////// + + template + struct Cloner; + + // fast path: Just copy data, without allocating anything. + template + struct Cloner { + void operator()(M const &source, M &target) const { + auto const *const src = reinterpret_cast(source.mKeyVals); + auto *tgt = reinterpret_cast(target.mKeyVals); + auto const numElementsWithBuffer = + target.calcNumElementsWithBuffer(target.mMask + 1); + std::copy(src, src + target.calcNumBytesTotal(numElementsWithBuffer), + tgt); + } + }; + + template + struct Cloner { + void operator()(M const &s, M &t) const { + auto const numElementsWithBuffer = + t.calcNumElementsWithBuffer(t.mMask + 1); + std::copy(s.mInfo, s.mInfo + t.calcNumBytesInfo(numElementsWithBuffer), + t.mInfo); + + for (size_t i = 0; i < numElementsWithBuffer; ++i) { + if (t.mInfo[i]) { + ::new (static_cast(t.mKeyVals + i)) Node(t, *s.mKeyVals[i]); + } + } + } + }; + + // Destroyer /////////////////////////////////////////////////////// + + template + struct Destroyer {}; + + template + struct Destroyer { + void nodes(M &m) const noexcept { m.mNumElements = 0; } + + void nodesDoNotDeallocate(M &m) const noexcept { m.mNumElements = 0; } + }; + + template + struct Destroyer { + void nodes(M &m) const noexcept { + m.mNumElements = 0; + // clear also resets mInfo to 0, that's sometimes not necessary. + auto const numElementsWithBuffer = + m.calcNumElementsWithBuffer(m.mMask + 1); + + for (size_t idx = 0; idx < numElementsWithBuffer; ++idx) { + if (0 != m.mInfo[idx]) { + Node &n = m.mKeyVals[idx]; + n.destroy(m); + n.~Node(); + } + } + } + + void nodesDoNotDeallocate(M &m) const noexcept { + m.mNumElements = 0; + // clear also resets mInfo to 0, that's sometimes not necessary. + auto const numElementsWithBuffer = + m.calcNumElementsWithBuffer(m.mMask + 1); + for (size_t idx = 0; idx < numElementsWithBuffer; ++idx) { + if (0 != m.mInfo[idx]) { + Node &n = m.mKeyVals[idx]; + n.destroyDoNotDeallocate(); + n.~Node(); + } + } + } + }; + + // Iter //////////////////////////////////////////////////////////// + + struct fast_forward_tag {}; + + // generic iterator for both const_iterator and iterator. + template + // NOLINTNEXTLINE(hicpp-special-member-functions,cppcoreguidelines-special-member-functions) + class Iter { + private: + using NodePtr = + typename std::conditional::type; + + public: + using difference_type = std::ptrdiff_t; + using value_type = typename Self::value_type; + using reference = typename std::conditional::type; + using pointer = typename std::conditional::type; + using iterator_category = std::forward_iterator_tag; + + // default constructed iterator can be compared to itself, but WON'T return + // true when + // compared to end(). + Iter() = default; + + // Rule of zero: nothing specified. The conversion constructor is only + // enabled for + // iterator to const_iterator, so it doesn't accidentally work as a copy + // ctor. + + // Conversion constructor from iterator to const_iterator. 
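The Cloner and Destroyer specializations above pick a bulk byte copy (and a no-op destroy) when the nodes are flat and trivially copyable, falling back to per-element work otherwise. A self-contained sketch of that dispatch, under the assumption that the destination elements already exist; names are hypothetical:

#include <cstddef>
#include <cstring>
#include <type_traits>

// Hypothetical illustration: trivially copyable nodes can be cloned with one
// memcpy; anything else is copied element by element via copy assignment.
template <typename Node>
void clone_nodes(const Node *src, Node *dst, std::size_t n) {
  if (std::is_trivially_copyable<Node>::value) {
    std::memcpy(static_cast<void *>(dst), static_cast<const void *>(src),
                n * sizeof(Node));
  } else {
    for (std::size_t i = 0; i < n; ++i) {
      dst[i] = src[i];  // assumes dst[i] was already constructed
    }
  }
}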
+ template ::type> + // NOLINTNEXTLINE(hicpp-explicit-conversions) + Iter(Iter const &other) noexcept : mKeyVals(other.mKeyVals), + mInfo(other.mInfo) {} + + Iter(NodePtr valPtr, uint8_t const *infoPtr) noexcept : mKeyVals(valPtr), + mInfo(infoPtr) {} + + Iter(NodePtr valPtr, uint8_t const *infoPtr, + fast_forward_tag ROBIN_HOOD_UNUSED(tag) /*unused*/) noexcept + : mKeyVals(valPtr), + mInfo(infoPtr) { + fastForward(); + } + + template ::type> + Iter &operator=(Iter const &other) noexcept { + mKeyVals = other.mKeyVals; + mInfo = other.mInfo; + return *this; + } + + // prefix increment. Undefined behavior if we are at end()! + Iter &operator++() noexcept { + mInfo++; + mKeyVals++; + fastForward(); + return *this; + } + + Iter operator++(int)noexcept { + Iter tmp = *this; + ++(*this); + return tmp; + } + + reference operator*() const { return **mKeyVals; } + + pointer operator->() const { return &**mKeyVals; } + + template + bool operator==(Iter const &o) const noexcept { + return mKeyVals == o.mKeyVals; + } + + template + bool operator!=(Iter const &o) const noexcept { + return mKeyVals != o.mKeyVals; + } + + private: + // fast forward to the next non-free info byte + // I've tried a few variants that don't depend on intrinsics, but + // unfortunately they are + // quite a bit slower than this one. So I've reverted that change again. See + // map_benchmark. + void fastForward() noexcept { + size_t n = 0; + while (0U == (n = detail::unaligned_load(mInfo))) { + mInfo += sizeof(size_t); + mKeyVals += sizeof(size_t); + } +#if defined(ROBIN_HOOD_DISABLE_INTRINSICS) + // we know for certain that within the next 8 bytes we'll find a non-zero + // one. + if (ROBIN_HOOD_UNLIKELY(0U == detail::unaligned_load(mInfo))) { + mInfo += 4; + mKeyVals += 4; + } + if (ROBIN_HOOD_UNLIKELY(0U == detail::unaligned_load(mInfo))) { + mInfo += 2; + mKeyVals += 2; + } + if (ROBIN_HOOD_UNLIKELY(0U == *mInfo)) { + mInfo += 1; + mKeyVals += 1; + } +#else +#if ROBIN_HOOD(LITTLE_ENDIAN) + auto inc = ROBIN_HOOD_COUNT_TRAILING_ZEROES(n) / 8; +#else + auto inc = ROBIN_HOOD_COUNT_LEADING_ZEROES(n) / 8; +#endif + mInfo += inc; + mKeyVals += inc; +#endif + } + + friend class Table; + NodePtr mKeyVals{nullptr}; + uint8_t const *mInfo{nullptr}; + }; + + //////////////////////////////////////////////////////////////////// + + // highly performance relevant code. + // Lower bits are used for indexing into the array (2^n size) + // The upper 1-5 bits need to be a reasonable good hash, to save comparisons. + template + void keyToIdx(HashKey &&key, size_t *idx, InfoType *info) const { + // In addition to whatever hash is used, add another mul & shift so we get + // better hashing. + // This serves as a bad hash prevention, if the given data is + // badly mixed. + auto h = static_cast(WHash::operator()(key)); + + h *= mHashMultiplier; + h ^= h >> 33U; + + // the lower InitialInfoNumBits are reserved for info. + *info = mInfoInc + static_cast((h & InfoMask) >> mInfoHashShift); + *idx = (static_cast(h) >> InitialInfoNumBits) & mMask; + } + + // forwards the index by one, wrapping around at the end + void next(InfoType *info, size_t *idx) const noexcept { + *idx = *idx + 1; + *info += mInfoInc; + } + + void nextWhileLess(InfoType *info, size_t *idx) const noexcept { + // unrolling this by hand did not bring any speedups. + while (*info < mInfo[*idx]) { + next(info, idx); + } + } + + // Shift everything up by one element. Tries to move stuff around. 
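keyToIdx() above mixes the user-provided hash with an extra multiply and xor-shift, then splits the result: the low InitialInfoNumBits feed the info byte and the remaining high bits select the bucket. A standalone sketch using the initial constants from this header (mInfoHashShift starts at 0); the function name is hypothetical:

#include <cstdint>
#include <cstdio>

// Hypothetical restatement of keyToIdx(): mix the hash, then split it into
// (info, bucket index). Constants mirror the initial values in this header.
void key_to_idx(uint64_t h, uint64_t mask, uint64_t *idx, uint32_t *info) {
  const uint32_t kInfoNumBits = 5;               // InitialInfoNumBits
  const uint32_t kInfoInc = 1U << kInfoNumBits;  // InitialInfoInc == 32
  const uint64_t kInfoMask = kInfoInc - 1U;      // InfoMask == 31

  h *= UINT64_C(0xc4ceb9fe1a85ec53);  // initial mHashMultiplier
  h ^= h >> 33U;

  *info = kInfoInc + static_cast<uint32_t>(h & kInfoMask);  // low bits -> info
  *idx = (h >> kInfoNumBits) & mask;                        // high bits -> bucket
}

int main() {
  uint64_t idx = 0;
  uint32_t info = 0;
  key_to_idx(12345U, /*mask=*/63U, &idx, &info);  // 64-bucket table
  std::printf("idx=%llu info=%u\n",
              static_cast<unsigned long long>(idx), info);
  return 0;
}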
+ void shiftUp(size_t startIdx, size_t const insertion_idx) noexcept( + std::is_nothrow_move_assignable::value) { + auto idx = startIdx; + ::new (static_cast(mKeyVals + idx)) + Node(std::move(mKeyVals[idx - 1])); + while (--idx != insertion_idx) { + mKeyVals[idx] = std::move(mKeyVals[idx - 1]); + } + + idx = startIdx; + while (idx != insertion_idx) { + ROBIN_HOOD_COUNT(shiftUp) + mInfo[idx] = static_cast(mInfo[idx - 1] + mInfoInc); + if (ROBIN_HOOD_UNLIKELY(mInfo[idx] + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + --idx; + } + } + + void shiftDown(size_t idx) noexcept( + std::is_nothrow_move_assignable::value) { + // until we find one that is either empty or has zero offset. + // TODO(martinus) we don't need to move everything, just the last one for + // the same + // bucket. + mKeyVals[idx].destroy(*this); + + // until we find one that is either empty or has zero offset. + while (mInfo[idx + 1] >= 2 * mInfoInc) { + ROBIN_HOOD_COUNT(shiftDown) + mInfo[idx] = static_cast(mInfo[idx + 1] - mInfoInc); + mKeyVals[idx] = std::move(mKeyVals[idx + 1]); + ++idx; + } + + mInfo[idx] = 0; + // don't destroy, we've moved it + // mKeyVals[idx].destroy(*this); + mKeyVals[idx].~Node(); + } + + // copy of find(), except that it returns iterator instead of const_iterator. + template + ROBIN_HOOD(NODISCARD) + size_t findIdx(Other const &key) const { + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + + do { + // unrolling this twice gives a bit of a speedup. More unrolling did not + // help. + if (info == mInfo[idx] && ROBIN_HOOD_LIKELY(WKeyEqual::operator()( + key, mKeyVals[idx].getFirst()))) { + return idx; + } + next(&info, &idx); + if (info == mInfo[idx] && ROBIN_HOOD_LIKELY(WKeyEqual::operator()( + key, mKeyVals[idx].getFirst()))) { + return idx; + } + next(&info, &idx); + } while (info <= mInfo[idx]); + + // nothing found! + return mMask == 0 + ? 0 + : static_cast(std::distance( + mKeyVals, + reinterpret_cast_no_cast_align_warning(mInfo))); + } + + void cloneData(const Table &o) { + Cloner()(o, *this); + } + + // inserts a keyval that is guaranteed to be new, e.g. when the hashmap is + // resized. + // @return True on success, false if something went wrong + void insert_move(Node &&keyval) { + // we don't retry, fail if overflowing + // don't need to check max num elements + if (0 == mMaxNumElementsAllowed && !try_increase_info()) { + throwOverflowError(); + } + + size_t idx{}; + InfoType info{}; + keyToIdx(keyval.getFirst(), &idx, &info); + + // skip forward. Use <= because we are certain that the element is not + // there. + while (info <= mInfo[idx]) { + idx = idx + 1; + info += mInfoInc; + } + + // key not found, so we are now exactly where we want to insert it. + auto const insertion_idx = idx; + auto const insertion_info = static_cast(info); + if (ROBIN_HOOD_UNLIKELY(insertion_info + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + + // find an empty spot + while (0 != mInfo[idx]) { + next(&info, &idx); + } + + auto &l = mKeyVals[insertion_idx]; + if (idx == insertion_idx) { + ::new (static_cast(&l)) Node(std::move(keyval)); + } else { + shiftUp(idx, insertion_idx); + l = std::move(keyval); + } + + // put at empty spot + mInfo[insertion_idx] = insertion_info; + + ++mNumElements; + } + + public: + using iterator = Iter; + using const_iterator = Iter; + + Table() noexcept(noexcept(Hash()) && noexcept(KeyEqual())) + : WHash(), WKeyEqual() { + ROBIN_HOOD_TRACE(this) + } + + // Creates an empty hash map. Nothing is allocated yet, this happens at the + // first insert. 
+ // This tremendously speeds up ctor & dtor of a map that never receives an + // element. The + // penalty is payed at the first insert, and not before. Lookup of this empty + // map works + // because everybody points to DummyInfoByte::b. parameter bucket_count is + // dictated by the + // standard, but we can ignore it. + explicit Table( + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/, const Hash &h = Hash{}, + const KeyEqual &equal = KeyEqual{}) noexcept(noexcept(Hash(h)) && + noexcept(KeyEqual(equal))) + : WHash(h), WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + } + + template + Table(Iter first, Iter last, + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/ = 0, + const Hash &h = Hash{}, const KeyEqual &equal = KeyEqual{}) + : WHash(h), WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + insert(first, last); + } + + Table(std::initializer_list initlist, + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/ = 0, + const Hash &h = Hash{}, const KeyEqual &equal = KeyEqual{}) + : WHash(h), WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + insert(initlist.begin(), initlist.end()); + } + + Table(Table &&o) noexcept : WHash(std::move(static_cast(o))), + WKeyEqual(std::move(static_cast(o))), + DataPool(std::move(static_cast(o))) { + ROBIN_HOOD_TRACE(this) + if (o.mMask) { + mHashMultiplier = std::move(o.mHashMultiplier); + mKeyVals = std::move(o.mKeyVals); + mInfo = std::move(o.mInfo); + mNumElements = std::move(o.mNumElements); + mMask = std::move(o.mMask); + mMaxNumElementsAllowed = std::move(o.mMaxNumElementsAllowed); + mInfoInc = std::move(o.mInfoInc); + mInfoHashShift = std::move(o.mInfoHashShift); + // set other's mask to 0 so its destructor won't do anything + o.init(); + } + } + + Table &operator=(Table &&o) noexcept { + ROBIN_HOOD_TRACE(this) + if (&o != this) { + if (o.mMask) { + // only move stuff if the other map actually has some data + destroy(); + mHashMultiplier = std::move(o.mHashMultiplier); + mKeyVals = std::move(o.mKeyVals); + mInfo = std::move(o.mInfo); + mNumElements = std::move(o.mNumElements); + mMask = std::move(o.mMask); + mMaxNumElementsAllowed = std::move(o.mMaxNumElementsAllowed); + mInfoInc = std::move(o.mInfoInc); + mInfoHashShift = std::move(o.mInfoHashShift); + WHash::operator=(std::move(static_cast(o))); + WKeyEqual::operator=(std::move(static_cast(o))); + DataPool::operator=(std::move(static_cast(o))); + + o.init(); + + } else { + // nothing in the other map => just clear us. + clear(); + } + } + return *this; + } + + Table(const Table &o) + : WHash(static_cast(o)), + WKeyEqual(static_cast(o)), + DataPool(static_cast(o)) { + ROBIN_HOOD_TRACE(this) + if (!o.empty()) { + // not empty: create an exact copy. it is also possible to just iterate + // through all + // elements and insert them, but copying is probably faster. + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(o.mMask + 1); + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + + ROBIN_HOOD_LOG("std::malloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mHashMultiplier = o.mHashMultiplier; + mKeyVals = static_cast( + detail::assertNotNull(std::malloc(numBytesTotal))); + // no need for calloc because clonData does memcpy + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + mNumElements = o.mNumElements; + mMask = o.mMask; + mMaxNumElementsAllowed = o.mMaxNumElementsAllowed; + mInfoInc = o.mInfoInc; + mInfoHashShift = o.mInfoHashShift; + cloneData(o); + } + } + + // Creates a copy of the given map. 
Copy constructor of each entry is used. + // Not sure why clang-tidy thinks this doesn't handle self assignment, it does + // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) + Table &operator=(Table const &o) { + ROBIN_HOOD_TRACE(this) + if (&o == this) { + // prevent assigning of itself + return *this; + } + + // we keep using the old allocator and not assign the new one, because we + // want to keep + // the memory available. when it is the same size. + if (o.empty()) { + if (0 == mMask) { + // nothing to do, we are empty too + return *this; + } + + // not empty: destroy what we have there + // clear also resets mInfo to 0, that's sometimes not necessary. + destroy(); + init(); + WHash::operator=(static_cast(o)); + WKeyEqual::operator=(static_cast(o)); + DataPool::operator=(static_cast(o)); + + return *this; + } + + // clean up old stuff + Destroyer::value>{} + .nodes(*this); + + if (mMask != o.mMask) { + // no luck: we don't have the same array size allocated, so we need to + // realloc. + if (0 != mMask) { + // only deallocate if we actually have data! + ROBIN_HOOD_LOG("std::free") + std::free(mKeyVals); + } + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(o.mMask + 1); + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + ROBIN_HOOD_LOG("std::malloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mKeyVals = static_cast( + detail::assertNotNull(std::malloc(numBytesTotal))); + + // no need for calloc here because cloneData performs a memcpy. + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + // sentinel is set in cloneData + } + WHash::operator=(static_cast(o)); + WKeyEqual::operator=(static_cast(o)); + DataPool::operator=(static_cast(o)); + mHashMultiplier = o.mHashMultiplier; + mNumElements = o.mNumElements; + mMask = o.mMask; + mMaxNumElementsAllowed = o.mMaxNumElementsAllowed; + mInfoInc = o.mInfoInc; + mInfoHashShift = o.mInfoHashShift; + cloneData(o); + + return *this; + } + + // Swaps everything between the two maps. + void swap(Table &o) { + ROBIN_HOOD_TRACE(this) + using std::swap; + swap(o, *this); + } + + // Clears all data, without resizing. + void clear() { + ROBIN_HOOD_TRACE(this) + if (empty()) { + // don't do anything! also important because we don't want to write to + // DummyInfoByte::b, even though we would just write 0 to it. + return; + } + + Destroyer::value>{} + .nodes(*this); + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(mMask + 1); + // clear everything, then set the sentinel again + uint8_t const z = 0; + std::fill(mInfo, mInfo + calcNumBytesInfo(numElementsWithBuffer), z); + mInfo[numElementsWithBuffer] = 1; + + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + // Destroys the map and all it's contents. + ~Table() { + ROBIN_HOOD_TRACE(this) + destroy(); + } + + // Checks if both tables contain the same entries. Order is irrelevant. 
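findIdx() shown earlier walks the table with a running info value: probing continues while that value is <= the stored info byte, and a hit additionally requires key equality. A toy standalone restatement of that probe loop; all names are hypothetical:

#include <cstddef>
#include <cstdint>

// Toy version of the findIdx() probe. info_bytes[i] == 0 marks an empty slot,
// and the real table guarantees a sentinel at the end, so the loop terminates.
// Returns n when the key is absent.
std::size_t probe(const uint8_t *info_bytes, const int *keys, std::size_t n,
                  std::size_t start_idx, uint32_t start_info,
                  uint32_t info_inc, int wanted) {
  std::size_t idx = start_idx;
  uint32_t info = start_info;
  while (info <= info_bytes[idx]) {
    if (info == info_bytes[idx] && keys[idx] == wanted) {
      return idx;  // found
    }
    ++idx;             // step to the next slot ...
    info += info_inc;  // ... and grow the expected displacement
  }
  return n;  // hit an empty slot or an entry closer to its home bucket
}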
+ bool operator==(const Table &other) const { + ROBIN_HOOD_TRACE(this) + if (other.size() != size()) { + return false; + } + for (auto const &otherEntry : other) { + if (!has(otherEntry)) { + return false; + } + } + + return true; + } + + bool operator!=(const Table &other) const { + ROBIN_HOOD_TRACE(this) + return !operator==(other); + } + + template + typename std::enable_if::value, Q &>::type operator[]( + const key_type &key) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, std::forward_as_tuple(key), + std::forward_as_tuple()); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, std::forward_as_tuple(key), + std::forward_as_tuple()); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + } + + return mKeyVals[idxAndState.first].getSecond(); + } + + template + typename std::enable_if::value, Q &>::type operator[]( + key_type &&key) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) Node( + *this, std::piecewise_construct, + std::forward_as_tuple(std::move(key)), std::forward_as_tuple()); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = Node( + *this, std::piecewise_construct, + std::forward_as_tuple(std::move(key)), std::forward_as_tuple()); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + } + + return mKeyVals[idxAndState.first].getSecond(); + } + + template + void insert(Iter first, Iter last) { + for (; first != last; ++first) { + // value_type ctor needed because this might be called with std::pair's + insert(value_type(*first)); + } + } + + void insert(std::initializer_list ilist) { + for (auto &&vt : ilist) { + insert(std::move(vt)); + } + } + + template + std::pair emplace(Args &&... args) { + ROBIN_HOOD_TRACE(this) + Node n{*this, std::forward(args)...}; + auto idxAndState = insertKeyPrepareEmptySpot(getFirstConst(n)); + switch (idxAndState.second) { + case InsertionState::key_found: + n.destroy(*this); + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::move(n)); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = std::move(n); + break; + + case InsertionState::overflow_error: + n.destroy(*this); + throwOverflowError(); + break; + } + + return std::make_pair( + iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + template + std::pair try_emplace(const key_type &key, Args &&... args) { + return try_emplace_impl(key, std::forward(args)...); + } + + template + std::pair try_emplace(key_type &&key, Args &&... args) { + return try_emplace_impl(std::move(key), std::forward(args)...); + } + + template + std::pair try_emplace(const_iterator hint, + const key_type &key, Args &&... args) { + (void)hint; + return try_emplace_impl(key, std::forward(args)...); + } + + template + std::pair try_emplace(const_iterator hint, key_type &&key, + Args &&... 
args) { + (void)hint; + return try_emplace_impl(std::move(key), std::forward(args)...); + } + + template + std::pair insert_or_assign(const key_type &key, + Mapped &&obj) { + return insertOrAssignImpl(key, std::forward(obj)); + } + + template + std::pair insert_or_assign(key_type &&key, Mapped &&obj) { + return insertOrAssignImpl(std::move(key), std::forward(obj)); + } + + template + std::pair insert_or_assign(const_iterator hint, + const key_type &key, + Mapped &&obj) { + (void)hint; + return insertOrAssignImpl(key, std::forward(obj)); + } + + template + std::pair insert_or_assign(const_iterator hint, + key_type &&key, Mapped &&obj) { + (void)hint; + return insertOrAssignImpl(std::move(key), std::forward(obj)); + } + + std::pair insert(const value_type &keyval) { + ROBIN_HOOD_TRACE(this) + return emplace(keyval); + } + + std::pair insert(value_type &&keyval) { + return emplace(std::move(keyval)); + } + + // Returns 1 if key is found, 0 otherwise. + size_t count(const key_type &key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv != reinterpret_cast_no_cast_align_warning(mInfo)) { + return 1; + } + return 0; + } + + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::type count( + const OtherKey &key) const { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv != reinterpret_cast_no_cast_align_warning(mInfo)) { + return 1; + } + return 0; + } + + bool contains(const key_type &key) const { // NOLINT(modernize-use-nodiscard) + return 1U == count(key); + } + + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::type contains( + const OtherKey &key) const { + return 1U == count(key); + } + + // Returns a reference to the value found for key. + // Throws std::out_of_range if element cannot be found + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::value, Q &>::type at( + key_type const &key) { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv == reinterpret_cast_no_cast_align_warning(mInfo)) { + doThrow("key not found"); + } + return kv->getSecond(); + } + + // Returns a reference to the value found for key. 
+ // Throws std::out_of_range if element cannot be found + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::value, Q const &>::type at( + key_type const &key) const { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv == reinterpret_cast_no_cast_align_warning(mInfo)) { + doThrow("key not found"); + } + return kv->getSecond(); + } + + const_iterator find( + const key_type &key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + template + const_iterator find(const OtherKey &key, + is_transparent_tag /*unused*/) const { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + template + typename std::enable_if< + Self_::is_transparent, // NOLINT(modernize-use-nodiscard) + const_iterator>::type // NOLINT(modernize-use-nodiscard) + find(const OtherKey &key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + iterator find(const key_type &key) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + template + iterator find(const OtherKey &key, is_transparent_tag /*unused*/) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + template + typename std::enable_if::type find( + const OtherKey &key) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + iterator begin() { + ROBIN_HOOD_TRACE(this) + if (empty()) { + return end(); + } + return iterator(mKeyVals, mInfo, fast_forward_tag{}); + } + const_iterator begin() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return cbegin(); + } + const_iterator cbegin() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + if (empty()) { + return cend(); + } + return const_iterator(mKeyVals, mInfo, fast_forward_tag{}); + } + + iterator end() { + ROBIN_HOOD_TRACE(this) + // no need to supply valid info pointer: end() must not be dereferenced, and + // only node + // pointer is compared. + return iterator{reinterpret_cast_no_cast_align_warning(mInfo), + nullptr}; + } + const_iterator end() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return cend(); + } + const_iterator cend() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return const_iterator{reinterpret_cast_no_cast_align_warning(mInfo), + nullptr}; + } + + iterator erase(const_iterator pos) { + ROBIN_HOOD_TRACE(this) + // its safe to perform const cast here + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + return erase(iterator{const_cast(pos.mKeyVals), + const_cast(pos.mInfo)}); + } + + // Erases element at pos, returns iterator to the next element. + iterator erase(iterator pos) { + ROBIN_HOOD_TRACE(this) + // we assume that pos always points to a valid entry, and not end(). 
+ auto const idx = static_cast(pos.mKeyVals - mKeyVals); + + shiftDown(idx); + --mNumElements; + + if (*pos.mInfo) { + // we've backward shifted, return this again + return pos; + } + + // no backward shift, return next element + return ++pos; + } + + size_t erase(const key_type &key) { + ROBIN_HOOD_TRACE(this) + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + + // check while info matches with the source idx + do { + if (info == mInfo[idx] && + WKeyEqual::operator()(key, mKeyVals[idx].getFirst())) { + shiftDown(idx); + --mNumElements; + return 1; + } + next(&info, &idx); + } while (info <= mInfo[idx]); + + // nothing found to delete + return 0; + } + + // reserves space for the specified number of elements. Makes sure the old + // data fits. + // exactly the same as reserve(c). + void rehash(size_t c) { + // forces a reserve + reserve(c, true); + } + + // reserves space for the specified number of elements. Makes sure the old + // data fits. + // Exactly the same as rehash(c). Use rehash(0) to shrink to fit. + void reserve(size_t c) { + // reserve, but don't force rehash + reserve(c, false); + } + + // If possible reallocates the map to a smaller one. This frees the underlying + // table. + // Does not do anything if load_factor is too large for decreasing the table's + // size. + void compact() { + ROBIN_HOOD_TRACE(this) + auto newSize = InitialNumElements; + while (calcMaxNumElementsAllowed(newSize) < mNumElements && newSize != 0) { + newSize *= 2; + } + if (ROBIN_HOOD_UNLIKELY(newSize == 0)) { + throwOverflowError(); + } + + ROBIN_HOOD_LOG("newSize > mMask + 1: " << newSize << " > " << mMask + << " + 1") + + // only actually do anything when the new size is bigger than the old one. + // This prevents to + // continuously allocate for each reserve() call. + if (newSize < mMask + 1) { + rehashPowerOfTwo(newSize, true); + } + } + + size_type size() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return mNumElements; + } + + size_type max_size() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return static_cast(-1); + } + + ROBIN_HOOD(NODISCARD) bool empty() const noexcept { + ROBIN_HOOD_TRACE(this) + return 0 == mNumElements; + } + + float max_load_factor() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return MaxLoadFactor100 / 100.0F; + } + + // Average number of elements per bucket. Since we allow only 1 per bucket + float load_factor() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return static_cast(size()) / static_cast(mMask + 1); + } + + ROBIN_HOOD(NODISCARD) size_t mask() const noexcept { + ROBIN_HOOD_TRACE(this) + return mMask; + } + + ROBIN_HOOD(NODISCARD) + size_t calcMaxNumElementsAllowed(size_t maxElements) const noexcept { + if (ROBIN_HOOD_LIKELY(maxElements <= + (std::numeric_limits::max)() / 100)) { + return maxElements * MaxLoadFactor100 / 100; + } + + // we might be a bit inprecise, but since maxElements is quite large that + // doesn't matter + return (maxElements / 100) * MaxLoadFactor100; + } + + ROBIN_HOOD(NODISCARD) + size_t calcNumBytesInfo(size_t numElements) const noexcept { + // we add a uint64_t, which houses the sentinel (first byte) and padding so + // we can load + // 64bit types. 
+ return numElements + sizeof(uint64_t); + } + + ROBIN_HOOD(NODISCARD) + size_t calcNumElementsWithBuffer(size_t numElements) const noexcept { + auto maxNumElementsAllowed = calcMaxNumElementsAllowed(numElements); + return numElements + + (std::min)(maxNumElementsAllowed, (static_cast(0xFF))); + } + + // calculation only allowed for 2^n values + ROBIN_HOOD(NODISCARD) size_t calcNumBytesTotal(size_t numElements) const { +#if ROBIN_HOOD(BITNESS) == 64 + return numElements * sizeof(Node) + calcNumBytesInfo(numElements); +#else + // make sure we're doing 64bit operations, so we are at least safe against + // 32bit overflows. + auto const ne = static_cast(numElements); + auto const s = static_cast(sizeof(Node)); + auto const infos = static_cast(calcNumBytesInfo(numElements)); + + auto const total64 = ne * s + infos; + auto const total = static_cast(total64); + + if (ROBIN_HOOD_UNLIKELY(static_cast(total) != total64)) { + throwOverflowError(); + } + return total; +#endif + } + + private: + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, bool>::type + has(const value_type &e) const { + ROBIN_HOOD_TRACE(this) + auto it = find(e.first); + return it != end() && it->second == e.second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, bool>::type + has(const value_type &e) const { + ROBIN_HOOD_TRACE(this) + return find(e) != end(); + } + + void reserve(size_t c, bool forceRehash) { + ROBIN_HOOD_TRACE(this) + auto const minElementsAllowed = (std::max)(c, mNumElements); + auto newSize = InitialNumElements; + while (calcMaxNumElementsAllowed(newSize) < minElementsAllowed && + newSize != 0) { + newSize *= 2; + } + if (ROBIN_HOOD_UNLIKELY(newSize == 0)) { + throwOverflowError(); + } + + ROBIN_HOOD_LOG("newSize > mMask + 1: " << newSize << " > " << mMask + << " + 1") + + // only actually do anything when the new size is bigger than the old one. + // This prevents to + // continuously allocate for each reserve() call. + if (forceRehash || newSize > mMask + 1) { + rehashPowerOfTwo(newSize, false); + } + } + + // reserves space for at least the specified number of elements. + // only works if numBuckets if power of two + // True on success, false otherwise + void rehashPowerOfTwo(size_t numBuckets, bool forceFree) { + ROBIN_HOOD_TRACE(this) + + Node *const oldKeyVals = mKeyVals; + uint8_t const *const oldInfo = mInfo; + + const size_t oldMaxElementsWithBuffer = + calcNumElementsWithBuffer(mMask + 1); + + // resize operation: move stuff + initData(numBuckets); + if (oldMaxElementsWithBuffer > 1) { + for (size_t i = 0; i < oldMaxElementsWithBuffer; ++i) { + if (oldInfo[i] != 0) { + // might throw an exception, which is really bad since we are in the + // middle of + // moving stuff. + insert_move(std::move(oldKeyVals[i])); + // destroy the node but DON'T destroy the data. + oldKeyVals[i].~Node(); + } + } + + // this check is not necessary as it's guarded by the previous if, but it + // helps + // silence g++'s overeager "attempt to free a non-heap object 'map' + // [-Werror=free-nonheap-object]" warning. 
+ if (oldKeyVals != + reinterpret_cast_no_cast_align_warning(&mMask)) { + // don't destroy old data: put it into the pool instead + if (forceFree) { + std::free(oldKeyVals); + } else { + DataPool::addOrFree(oldKeyVals, + calcNumBytesTotal(oldMaxElementsWithBuffer)); + } + } + } + } + + ROBIN_HOOD(NOINLINE) void throwOverflowError() const { +#if ROBIN_HOOD(HAS_EXCEPTIONS) + throw std::overflow_error("robin_hood::map overflow"); +#else + abort(); +#endif + } + + template + std::pair try_emplace_impl(OtherKey &&key, Args &&... args) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(args)...)); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(args)...)); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + break; + } + + return std::make_pair( + iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + template + std::pair insertOrAssignImpl(OtherKey &&key, Mapped &&obj) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + mKeyVals[idxAndState.first].getSecond() = std::forward(obj); + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(obj))); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(obj))); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + break; + } + + return std::make_pair( + iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + void initData(size_t max_elements) { + mNumElements = 0; + mMask = max_elements - 1; + mMaxNumElementsAllowed = calcMaxNumElementsAllowed(max_elements); + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(max_elements); + + // calloc also zeroes everything + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + ROBIN_HOOD_LOG("std::calloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mKeyVals = reinterpret_cast( + detail::assertNotNull(std::calloc(1, numBytesTotal))); + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + + // set sentinel + mInfo[numElementsWithBuffer] = 1; + + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + enum class InsertionState { + overflow_error, + key_found, + new_node, + overwrite_node + }; + + // Finds key, and if not already present prepares a spot where to pot the key + // & value. + // This potentially shifts nodes out of the way, updates mInfo and number of + // inserted + // elements, so the only operation left to do is create/assign a new node at + // that spot. 
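reserve() and compact() above both search for the smallest power-of-two table size whose allowed load, calcMaxNumElementsAllowed() = size * MaxLoadFactor100 / 100, covers the required element count, and treat a resulting size of 0 as overflow. A small standalone sketch of that computation; the function name is hypothetical and 80 stands in for the default MaxLoadFactor100:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hypothetical restatement of the growth rule used by reserve()/compact():
// smallest power-of-two bucket count whose allowed load covers `wanted`.
std::size_t grow_size(std::size_t wanted, std::size_t max_load_factor_100) {
  std::size_t n = sizeof(uint64_t);  // InitialNumElements == 8
  while (n != 0 && n * max_load_factor_100 / 100 < wanted) {
    n *= 2;  // wraps to 0 on overflow, which signals throwOverflowError()
  }
  return n;
}

int main() {
  // 1024 * 0.8 = 819 < 1000, 2048 * 0.8 = 1638 >= 1000 -> grow to 2048 buckets.
  std::printf("%zu\n", grow_size(1000, 80));
  return 0;
}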
+ template + std::pair insertKeyPrepareEmptySpot(OtherKey &&key) { + for (int i = 0; i < 256; ++i) { + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + nextWhileLess(&info, &idx); + + // while we potentially have a match + while (info == mInfo[idx]) { + if (WKeyEqual::operator()(key, mKeyVals[idx].getFirst())) { + // key already exists, do NOT insert. + // see http://en.cppreference.com/w/cpp/container/unordered_map/insert + return std::make_pair(idx, InsertionState::key_found); + } + next(&info, &idx); + } + + // unlikely that this evaluates to true + if (ROBIN_HOOD_UNLIKELY(mNumElements >= mMaxNumElementsAllowed)) { + if (!increase_size()) { + return std::make_pair(size_t(0), InsertionState::overflow_error); + } + continue; + } + + // key not found, so we are now exactly where we want to insert it. + auto const insertion_idx = idx; + auto const insertion_info = info; + if (ROBIN_HOOD_UNLIKELY(insertion_info + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + + // find an empty spot + while (0 != mInfo[idx]) { + next(&info, &idx); + } + + if (idx != insertion_idx) { + shiftUp(idx, insertion_idx); + } + // put at empty spot + mInfo[insertion_idx] = static_cast(insertion_info); + ++mNumElements; + return std::make_pair( + insertion_idx, idx == insertion_idx ? InsertionState::new_node + : InsertionState::overwrite_node); + } + + // enough attempts failed, so finally give up. + return std::make_pair(size_t(0), InsertionState::overflow_error); + } + + bool try_increase_info() { + ROBIN_HOOD_LOG("mInfoInc=" << mInfoInc << ", numElements=" << mNumElements + << ", maxNumElementsAllowed=" + << calcMaxNumElementsAllowed(mMask + 1)) + if (mInfoInc <= 2) { + // need to be > 2 so that shift works (otherwise undefined behavior!) + return false; + } + // we got space left, try to make info smaller + mInfoInc = static_cast(mInfoInc >> 1U); + + // remove one bit of the hash, leaving more space for the distance info. + // This is extremely fast because we can operate on 8 bytes at once. + ++mInfoHashShift; + auto const numElementsWithBuffer = calcNumElementsWithBuffer(mMask + 1); + + for (size_t i = 0; i < numElementsWithBuffer; i += 8) { + auto val = unaligned_load(mInfo + i); + val = (val >> 1U) & UINT64_C(0x7f7f7f7f7f7f7f7f); + std::memcpy(mInfo + i, &val, sizeof(val)); + } + // update sentinel, which might have been cleared out! + mInfo[numElementsWithBuffer] = 1; + + mMaxNumElementsAllowed = calcMaxNumElementsAllowed(mMask + 1); + return true; + } + + // True if resize was possible, false otherwise + bool increase_size() { + // nothing allocated yet? just allocate InitialNumElements + if (0 == mMask) { + initData(InitialNumElements); + return true; + } + + auto const maxNumElementsAllowed = calcMaxNumElementsAllowed(mMask + 1); + if (mNumElements < maxNumElementsAllowed && try_increase_info()) { + return true; + } + + ROBIN_HOOD_LOG("mNumElements=" + << mNumElements + << ", maxNumElementsAllowed=" << maxNumElementsAllowed + << ", load=" << (static_cast(mNumElements) * 100.0 / + (static_cast(mMask) + 1))) + + nextHashMultiplier(); + if (mNumElements * 2 < calcMaxNumElementsAllowed(mMask + 1)) { + // we have to resize, even though there would still be plenty of space + // left! + // Try to rehash instead. Delete freed memory so we don't steadyily + // increase mem in case + // we have to rehash a few times + rehashPowerOfTwo(mMask + 1, true); + } else { + // Each resize use a different hash so we don't so easily overflow. 
+ // Make sure we only have odd numbers, so that the multiplication is + // reversible! + rehashPowerOfTwo((mMask + 1) * 2, false); + } + return true; + } + + void nextHashMultiplier() { + // adding an *even* number, so that the multiplier will always stay odd. + // This is necessary + // so that the hash stays a mixing function (and thus doesn't have any + // information loss). + mHashMultiplier += UINT64_C(0xc4ceb9fe1a85ec54); + } + + void destroy() { + if (0 == mMask) { + // don't deallocate! + return; + } + + Destroyer::value>{} + .nodesDoNotDeallocate(*this); + + // This protection against not deleting mMask shouldn't be needed as it's + // sufficiently + // protected with the 0==mMask check, but I have this anyways because g++ 7 + // otherwise + // reports a compile error: attempt to free a non-heap object 'fm' + // [-Werror=free-nonheap-object] + if (mKeyVals != reinterpret_cast_no_cast_align_warning(&mMask)) { + ROBIN_HOOD_LOG("std::free") + std::free(mKeyVals); + } + } + + void init() noexcept { + mKeyVals = reinterpret_cast_no_cast_align_warning(&mMask); + mInfo = reinterpret_cast(&mMask); + mNumElements = 0; + mMask = 0; + mMaxNumElementsAllowed = 0; + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + // members are sorted so no padding occurs + uint64_t mHashMultiplier = UINT64_C(0xc4ceb9fe1a85ec53); // 8 byte 8 + Node *mKeyVals = + reinterpret_cast_no_cast_align_warning(&mMask); // 8 byte 16 + uint8_t *mInfo = reinterpret_cast(&mMask); // 8 byte 24 + size_t mNumElements = 0; // 8 byte 32 + size_t mMask = 0; // 8 byte 40 + size_t mMaxNumElementsAllowed = 0; // 8 byte 48 + InfoType mInfoInc = InitialInfoInc; // 4 byte 52 + InfoType mInfoHashShift = InitialInfoHashShift; // 4 byte 56 + // 16 byte 56 if NodeAllocator +}; + +} // namespace detail + +// map + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_flat_map = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_node_map = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_map = detail::Table< + sizeof(robin_hood::pair) <= sizeof(size_t) * 6 && + std::is_nothrow_move_constructible>::value && + std::is_nothrow_move_assignable>::value, + MaxLoadFactor100, Key, T, Hash, KeyEqual>; + +// set + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_flat_set = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_node_set = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_set = + detail::Table::value && + std::is_nothrow_move_assignable::value, + MaxLoadFactor100, Key, void, Hash, KeyEqual>; + +} // namespace robin_hood + +#endif diff --git a/paddle/fluid/extension/include/ext_tensor.h b/paddle/fluid/extension/include/ext_tensor.h index 52606b2a7f59e..fa91490e6cd8a 100644 --- a/paddle/fluid/extension/include/ext_tensor.h +++ b/paddle/fluid/extension/include/ext_tensor.h @@ -113,6 +113,9 @@ class PD_DLL_DECL Tensor { /// \brief Cast datatype from one to another Tensor cast(const DataType& target_type) const; + /// \brief Check Tensor is initialized + bool is_initialized() const; + #ifdef PADDLE_WITH_CUDA /// \bref Get current stream of Tensor cudaStream_t stream() const; diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc index 
e9705e2101cc3..8b2f7cc5bf13c 100644 --- a/paddle/fluid/extension/src/ext_tensor.cc +++ b/paddle/fluid/extension/src/ext_tensor.cc @@ -103,15 +103,6 @@ void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc, void Tensor::reshape(const std::vector &shape) { GET_CASTED_TENSOR auto new_dim = framework::make_ddim(shape); - if (tensor->numel() != framework::product(new_dim)) { - LOG(WARNING) << "Custom Op: Calling reshape to a new shape which is bigger " - "or smaller" - << "than original shape will not change your tensor's memory " - "Please call" - << "paddle::Tensor::mutable_data() after to reallocate " - "your tensor's size." - << std::endl; - } tensor->Resize(new_dim); } @@ -393,6 +384,15 @@ int64_t Tensor::size() const { return tensor->numel(); } +bool Tensor::is_initialized() const { + GET_CASTED_TENSOR; + if (tensor->IsInitialized()) { + return true; + } else { + return false; + } +} + #ifdef PADDLE_WITH_CUDA cudaStream_t Tensor::stream() const { if (!stream_.IsStreamSet()) { diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 13c37b93d7c98..24bed27728083 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -191,13 +191,15 @@ if(WITH_PYTHON) py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto) #Generate an empty \ #__init__.py to make framework_py_proto as a valid python module. + add_custom_target(fleet_proto_init ALL + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py + ) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) - add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto) + add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init) if (NOT WIN32) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." @@ -207,8 +209,6 @@ if(WITH_PYTHON) string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/") add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py COMMAND copy /Y *.py ${proto_dstpath} COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath} COMMENT "Copy generated python proto into directory paddle/fluid/proto." 
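The Tensor::is_initialized() implementation added to ext_tensor.cc earlier in this diff, together with the declaration in ext_tensor.h, lets custom operators check whether a paddle::Tensor already owns memory before touching its data. A minimal usage sketch, assuming the extension header path from this diff; the helper function itself is hypothetical and not part of the patch:

#include "paddle/fluid/extension/include/ext_tensor.h"

// Hypothetical helper: read the first element only when the tensor has been
// allocated (i.e. mutable_data() has already been called on it).
float FirstElementOrZero(const paddle::Tensor &t) {
  if (!t.is_initialized()) {
    return 0.0f;
  }
  return t.data<float>()[0];
}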
@@ -217,6 +217,12 @@ if(WITH_PYTHON) endif(NOT WIN32) endif() +if (WITH_PSCORE) + add_custom_target(index_dataset_proto_init ALL DEPENDS fleet_proto_init index_dataset_py_proto + COMMAND cp ${PADDLE_BINARY_DIR}/paddle/fluid/distributed/index_dataset/index_dataset_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto.") +endif(WITH_PSCORE) + cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc index 8d6fd4efd5ae3..a65dcbd55f946 100644 --- a/paddle/fluid/framework/custom_tensor_test.cc +++ b/paddle/fluid/framework/custom_tensor_test.cc @@ -220,6 +220,21 @@ void GroupTestDtypeConvert() { paddle::DataType::FLOAT16); } +void TestInitilized() { + paddle::Tensor test_tensor(paddle::PlaceType::kCPU); + CHECK(test_tensor.is_initialized() == false); + test_tensor.reshape({1, 1}); + test_tensor.mutable_data(); + CHECK(test_tensor.is_initialized() == true); + float* tensor_data = test_tensor.data(); + for (int i = 0; i < test_tensor.size(); i++) { + tensor_data[i] = 0.5; + } + for (int i = 0; i < test_tensor.size(); i++) { + CHECK(tensor_data[i] == 0.5); + } +} + TEST(CustomTensor, copyTest) { VLOG(2) << "TestCopy"; GroupTestCopy(); @@ -233,4 +248,6 @@ TEST(CustomTensor, copyTest) { GroupTestCast(); VLOG(2) << "TestDtypeConvert"; GroupTestDtypeConvert(); + VLOG(2) << "TestInitilized"; + TestInitilized(); } diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 103dd0c5ae599..0fdb97db20af9 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -354,8 +354,36 @@ void CheckVarHasNanOrInf(const std::string& op_type, var_name)); #endif return; - } + } else if (platform::is_npu_place(tensor->place())) { +#ifdef PADDLE_WITH_ASCEND_CL + if (tensor->type() != proto::VarType::FP32) { + return; + } + + framework::LoDTensor cpu_tensor; + cpu_tensor.Resize(tensor->dims()); + float* cpu_data = static_cast( + cpu_tensor.mutable_data(platform::CPUPlace(), tensor->type())); + framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); + bool flag = false; + for (int i = 0; i < cpu_tensor.numel(); i++) { + if (isnan(cpu_data[i]) || isinf(cpu_data[i])) { + flag = true; + break; + } + } + PADDLE_ENFORCE_NE( + flag, true, + platform::errors::Fatal("Operator %s output Tensor %s contains Inf.", + op_type, var_name)); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Tensor[%s] use npu place. 
PaddlePaddle must compile with NPU.", + var_name)); +#endif + return; + } tensor_check(op_type, var_name, *tensor, place); } diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 9ced4221e1dd6..a49e492e48028 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -562,7 +562,6 @@ class PSGPUWorker : public HogwildWorker { void ResetStat(); protected: - std::shared_ptr fleet_ptr_; void PushGradients(); void DumpParam(); void CopySparseTable(); @@ -639,7 +638,8 @@ class PSGPUWorker : public HogwildWorker { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(WITH_ASCEND_CL) class SectionWorker : public DeviceWorker { public: SectionWorker() {} diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc index a539a5d5f96b5..5780a95343385 100644 --- a/paddle/fluid/framework/device_worker_factory.cc +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -79,7 +79,8 @@ REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker); REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(WITH_ASCEND_CL) REGISTER_DEVICE_WORKER_CLASS(SectionWorker); #endif } // namespace framework diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto old mode 100755 new mode 100644 index 6363eedc80a20..654b88920acaf --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -43,6 +43,12 @@ message ShardingConfig { optional int32 pp_degree = 11 [ default = 1 ]; } +message HybridConfig { + optional int32 dp_degree = 1 [ default = -1 ]; + optional int32 mp_degree = 2 [ default = 1 ]; + optional int32 pp_degree = 3 [ default = 1 ]; +} + message AMPConfig { optional float init_loss_scaling = 1 [ default = 32768.0 ]; optional int32 incr_every_n_steps = 2 [ default = 1000 ]; @@ -124,6 +130,7 @@ message AsyncConfig { optional bool launch_barrier = 9 [ default = true ]; optional string heter_worker_device_guard = 10 [ default = 'cpu' ]; optional int32 lr_decay_steps = 11 [ default = 10 ]; + optional int32 use_ps_gpu = 12 [ default = 0 ]; } message PipelineConfig { @@ -132,6 +139,10 @@ message PipelineConfig { optional string schedule_mode = 3 [ default = '1F1B' ]; } +message TensorParallelConfig { + optional int32 tensor_parallel_degree = 1 [ default = 1 ]; +} + message DistributedStrategy { // bool options optional Mode mode = 1 [ default = COLLECTIVE ]; @@ -162,6 +173,7 @@ message DistributedStrategy { optional bool sharding = 26 [ default = false ]; optional float last_comm_group_size_MB = 27 [ default = 1 ]; optional bool find_unused_parameters = 28 [ default = true ]; + optional bool tensor_parallel = 29 [ default = false ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; @@ -174,6 +186,8 @@ message DistributedStrategy { optional LambConfig lamb_configs = 109; optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110; optional ShardingConfig sharding_configs = 111; + optional HybridConfig hybrid_configs = 112; + optional TensorParallelConfig tensor_parallel_configs = 113; optional BuildStrategy build_strategy = 201; optional ExecutionStrategy execution_strategy = 202; } diff --git a/paddle/fluid/framework/executor.cc 
b/paddle/fluid/framework/executor.cc index 101991d2c1ba0..e5bfbf4a8f779 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -456,11 +456,22 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, #endif } else if (platform::is_npu_place(place_)) { #ifdef PADDLE_WITH_ASCEND_CL - // TODO(ascendrc): Support garbage collector on NPUPlace - VLOG(4) << "Skip NPU gc because it is not implemented now."; + if (IsFastEagerDeletionModeEnabled()) { + VLOG(4) << "Use unsafe fast gc for NPU."; + gc.reset(new NPUUnsafeFastGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Please set FLAGS_fast_eager_deletion_mode=true to use " + "GarbageCollector on NPU.")); + // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. + VLOG(4) << "Use default stream gc for NPU."; + gc.reset(new NPUDefaultStreamGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + } #else - PADDLE_THROW(platform::errors::Unimplemented( - "No NPU gc found in CPU/GPU/XPU paddle")); + PADDLE_THROW( + platform::errors::Unimplemented("No NPU gc found in CPU/NPU paddle")); #endif } } diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index ce0a905afc628..03dd2cff655c0 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -1,5 +1,10 @@ if(WITH_PSLIB) cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope pslib_brpc pslib) +else() + cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) +endif(WITH_PSLIB) + +if(WITH_HETERPS) if(WITH_NCCL) nv_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc DEPS heter_ps) @@ -8,13 +13,10 @@ if(WITH_PSLIB) hip_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc DEPS heter_ps) add_subdirectory(heter_ps) - else() - cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc) endif(WITH_NCCL) else() - cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc) -endif(WITH_PSLIB) +endif(WITH_HETERPS) if(WITH_NCCL OR WITH_RCCL) cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope) @@ -41,6 +43,6 @@ cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_conte cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell) -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph) -endif(WITH_ASCEND) +endif() diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.cc b/paddle/fluid/framework/fleet/ascend_wrapper.cc index d1b2f51f70036..273939f6bee61 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.cc +++ b/paddle/fluid/framework/fleet/ascend_wrapper.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/framework/fleet/ascend_wrapper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h index baa2fd126a4b7..f749ee8cfa0ba 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.h +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include #include @@ -29,7 +29,6 @@ limitations under the License. */ #include "paddle/fluid/platform/timer.h" #include "ge/ge_api.h" -#include "ge/ge_api_types.h" #include "graph/attr_value.h" #include "graph/tensor.h" #include "graph/types.h" diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index e584fb5e2b9ca..613b2803637d2 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -34,6 +34,9 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/platform/type_defs.h" +#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index a02931b3f5c28..1fb2f0fab4aff 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -14,15 +14,21 @@ limitations under the License. */ #pragma once -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ - (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_HETERPS #include #include #include #include +#ifdef PADDLE_WITH_PSLIB #include "common_value.h" // NOLINT +#endif + +#ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" +#endif + #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/scope.h" @@ -39,7 +45,12 @@ class HeterContext { } Scope* scope_{nullptr}; std::vector> feature_keys_; +#ifdef PADDLE_WITH_PSLIB std::vector> value_ptr_; +#endif +#ifdef PADDLE_WITH_PSCORE + std::vector> value_ptr_; +#endif std::vector> device_values_; std::vector> device_keys_; std::vector mutex_; @@ -66,6 +77,21 @@ class HeterContext { mutex_[i] = new std::mutex(); } } + + void Reset() { + for (size_t i = 0; i < feature_keys_.size(); ++i) { + feature_keys_[i].clear(); + } + for (size_t i = 0; i < value_ptr_.size(); ++i) { + value_ptr_[i].clear(); + } + for (size_t i = 0; i < device_values_.size(); ++i) { + device_values_[i].clear(); + } + for (size_t i = 0; i < device_keys_.size(); ++i) { + device_keys_[i].clear(); + } + } void batch_add_keys( const std::vector>& thread_keys) { assert(thread_keys.size() == feature_keys_.size()); @@ -79,6 +105,15 @@ class HeterContext { } } + void batch_add_keys(int shard_num, + const std::unordered_set& shard_keys) { + int idx = feature_keys_[shard_num].size(); + feature_keys_[shard_num].resize(feature_keys_[shard_num].size() + + shard_keys.size()); + std::copy(shard_keys.begin(), shard_keys.end(), + feature_keys_[shard_num].begin() + idx); + } + void UniqueKeys() { std::vector threads; auto unique_func = [this](int i) { diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index 698ece09de6c5..c3bf33b32c2da 
100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS #include diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index e5c0972763bed..089130f6da8c7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -17,11 +17,17 @@ limitations under the License. */ #include #include #include +#ifdef PADDLE_WITH_PSLIB #include "common_value.h" // NOLINT +#endif +#ifdef PADDLE_WITH_PSCORE +#endif #include "thrust/pair.h" //#include "cudf/concurrent_unordered_map.cuh.h" #include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" +#include "paddle/fluid/platform/type_defs.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h index 871f9c7857af4..098c795fc7e1f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { @@ -119,6 +119,7 @@ void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { continue; } ValType& gpu_val = kv[i].second; +#ifdef PADDLE_WITH_PSLIB auto* downpour_value = (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); int downpour_value_size = downpour_value->size(); @@ -138,6 +139,14 @@ void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { cpu_val[x + 7] = gpu_val.mf[x]; } } +#endif +#ifdef PADDLE_WITH_PSCORE + auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr); + downpour_value->count_ = gpu_val.show; + for (int x = 0; x < gpu_val.mf_size; x++) { + downpour_value->data_[x] = gpu_val.mf[x]; + } +#endif } container_->prefetch(devid, stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 0e38ebbd7f4e7..2ec2a8a1f1e22 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -25,7 +25,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "thrust/pair.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { @@ -182,7 +182,7 @@ class HeterComm { std::vector> path_; std::vector storage_; int feanum_{1800 * 2048}; - int multi_node_{1}; + int multi_node_{0}; std::vector nccl_inner_comms_; std::vector nccl_inter_comms_; int node_size_; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 2f1c809c01eaa..1b4205e3c38fe 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ #pragma once +#ifdef PADDLE_WITH_HETERPS #include -#ifdef PADDLE_WITH_PSLIB namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index f2e129ded9fef..581b0d511c23e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { @@ -54,8 +54,8 @@ void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } void HeterPs::push_sparse(int num, FeatureKey* d_keys, FeaturePushValue* d_grads, size_t len) { - // comm_->push_sparse(num, d_keys, d_grads, len, opt_); - comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); + comm_->push_sparse(num, d_keys, d_grads, len, opt_); + // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); } void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 142f4a93b93a2..d78b6b492074d 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 7980220eab9b9..05b3ecf9c3c12 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc index f65b664f83ba0..0f2af2a522e28 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS #include "heter_resource.h" #include "paddle/fluid/platform/cuda_device_guard.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h index ad7649a8a33cb..7b23379994c73 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h @@ -20,7 +20,7 @@ limitations under the License. 
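The heter_ps.cu hunk above routes HeterPs::push_sparse back through the single-process communication path, and multi_node_ now defaults to 0 in both HeterComm and PSGPUWrapper. A small hedged sketch of the call as seen by a caller, assuming push_sparse is exposed on the HeterPsBase interface declared in heter_ps_base.h (signatures taken from the hunks above; the wrapper function is illustrative only):

#include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h"

void PushOneTableBatch(paddle::framework::HeterPsBase* heter_ps, int dev_num,
                       paddle::framework::FeatureKey* d_keys,
                       paddle::framework::FeaturePushValue* d_grads,
                       size_t len) {
  // With this patch the gradients flow through push_sparse (single node);
  // push_sparse_multi_node stays in the code but is no longer the default,
  // matching multi_node_{0} above.
  heter_ps->push_sparse(dev_num, d_keys, d_grads, len);
}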
*/ #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/enforce.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index b3ec9e752e62b..7e82a8e014fd3 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "optimizer_conf.h" #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 4274876c9975e..67ff6b6acaefb 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -26,8 +26,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ - (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_HETERPS #include #include @@ -58,7 +57,12 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, auto& device_mutex = gpu_task->mutex_; std::vector threads; +#ifdef PADDLE_WITH_PSLIB auto fleet_ptr = FleetWrapper::GetInstance(); +#endif +#ifdef PADDLE_WITH_PSCORE + auto fleet_ptr = paddle::distributed::Communicator::GetInstance(); +#endif // data should be in input channel thread_keys_.resize(thread_keys_thread_num_); @@ -99,12 +103,26 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, timeline.Start(); + threads.clear(); // merge thread_keys to shard_keys - for (size_t i = 0; i < thread_keys_.size(); i++) { - gpu_task->batch_add_keys(thread_keys_[i]); - for (int j = 0; j < thread_keys_thread_num_; j++) { - thread_keys_[i][j].clear(); + auto merge_ins_func = [this, gpu_task](int shard_num) { + for (int i = 0; i < thread_keys_thread_num_; ++i) { + gpu_task->batch_add_keys(shard_num, thread_keys_[i][shard_num]); + thread_keys_[i][shard_num].clear(); } + }; + + // for (size_t i = 0; i < thread_keys_.size(); i++) { + // gpu_task->batch_add_keys(thread_keys_[i]); + // for (int j = 0; j < thread_keys_thread_num_; j++) { + // thread_keys_[i][j].clear(); + // } + //} + for (int i = 0; i < thread_keys_shard_num_; ++i) { + threads.push_back(std::thread(merge_ins_func, i)); + } + for (auto& t : threads) { + t.join(); } timeline.Pause(); @@ -124,9 +142,16 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, auto ptl_func = [this, &local_keys, &local_ptr, &table_id, &fleet_ptr](int i) { size_t key_size = local_keys[i].size(); +#ifdef PADDLE_WITH_PSLIB auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( reinterpret_cast(local_ptr[i].data()), table_id, local_keys[i].data(), key_size); +#endif +#ifdef PADDLE_WITH_PSCORE + auto tt = fleet_ptr->_worker_ptr->pull_sparse_ptr( + reinterpret_cast(local_ptr[i].data()), table_id, + local_keys[i].data(), key_size); +#endif tt.wait(); auto status = tt.get(); // auto status = 0; @@ -153,8 +178,14 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, auto build_func = [device_num, &local_keys, &local_ptr, &device_keys, &device_vals, &device_mutex](int i) { std::vector> task_keys(device_num); +#ifdef PADDLE_WITH_PSLIB std::vector> task_ptrs( device_num); +#endif + +#ifdef 
PADDLE_WITH_PSCORE + std::vector> task_ptrs(device_num); +#endif for (size_t j = 0; j < local_keys[i].size(); j++) { int shard = local_keys[i][j] % device_num; @@ -169,7 +200,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, int cur = device_keys[dev].size(); device_keys[dev].resize(device_keys[dev].size() + len); device_vals[dev].resize(device_vals[dev].size() + len); - +#ifdef PADDLE_WITH_PSLIB for (int j = 0; j < len; ++j) { device_keys[dev][cur + j] = task_keys[dev][j]; float* ptr_val = task_ptrs[dev][j]->data(); @@ -196,6 +227,35 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, } } } +#endif +#ifdef PADDLE_WITH_PSCORE + for (int j = 0; j < len; ++j) { + device_keys[dev][cur + j] = task_keys[dev][j]; + distributed::VALUE* ptr_val = task_ptrs[dev][j]; + FeatureValue& val = device_vals[dev][cur + j]; + bool has_mf = 1; + val.delta_score = 0; + val.show = ptr_val->count_; + val.clk = 0; + val.slot = 0; + val.lr = 0; + val.lr_g2sum = 0; + val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); + + if (has_mf) { + val.mf_size = MF_DIM + 1; + for (int x = 0; x < val.mf_size; x++) { + val.mf[x] = ptr_val->data_[x]; + } + } else { + val.mf_size = 0; + for (int x = 0; x < MF_DIM + 1; x++) { + val.mf[x] = 0; + } + } + } +#endif + VLOG(1) << "GpuPs build hbmps done"; device_mutex[dev]->unlock(); } @@ -215,6 +275,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { int device_num = heter_devices_.size(); std::shared_ptr gpu_task = gpu_task_pool_.Get(); + gpu_task->Reset(); BuildTask(gpu_task, table_id, feature_dim); platform::Timer timeline; timeline.Start(); @@ -227,8 +288,8 @@ void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { size_max = std::max(size_max, feature_keys_count[i]); } if (HeterPs_) { - HeterPs_->show_one_table(0); - return; + delete HeterPs_; + HeterPs_ = nullptr; } std::vector threads(device_num); HeterPs_ = HeterPsBase::get_instance(size_max, resource_); @@ -249,6 +310,7 @@ void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { timeline.Pause(); VLOG(1) << "GpuPs build table total costs: " << timeline.ElapsedSec() << " s."; + gpu_task_pool_.Push(gpu_task); } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 2eedcd5f1c700..2bf564d3f76d5 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS #include #include #include diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index ef586b41fe05d..cfb23d1be2acf 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -14,8 +14,7 @@ limitations under the License. */ #pragma once -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ - (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_HETERPS #include #include @@ -26,7 +25,6 @@ limitations under the License. */ #include #include #include - #ifdef PADDLE_WITH_GLOO #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" @@ -42,6 +40,9 @@ limitations under the License. 
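A hedged sketch of the per-pass flow that the BuildGPUPS changes above enable: each call now resets the pooled gpu_task, rebuilds the HBM table from freshly pulled CPU values, and returns the task to the pool, instead of returning early once HeterPs_ exists. PSGPUWrapper::GetInstance() is assumed to be the usual singleton accessor; it is not shown in this diff.

#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"

void TrainPasses(uint64_t table_id, int feature_dim, int pass_num) {
  // Assumed singleton accessor; not part of this diff.
  auto ps_gpu_ptr = paddle::framework::PSGPUWrapper::GetInstance();
  for (int pass = 0; pass < pass_num; ++pass) {
    // After this patch each call rebuilds the GPU table for the pass's keys
    // rather than reusing the table built in the first pass.
    ps_gpu_ptr->BuildGPUPS(table_id, feature_dim);
    // ... run the pass's minibatches here ...
  }
}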
*/ #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/service/communicator.h" +#endif namespace paddle { namespace framework { @@ -219,7 +220,7 @@ class PSGPUWrapper { std::shared_ptr resource_; int32_t sleep_seconds_before_fail_exit_; std::vector slot_vector_; - int multi_node_{1}; + int multi_node_{0}; int node_size_; std::vector inner_comms_; std::vector inter_comms_; diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 8dfbd3c268b86..9ab6b5d8c178b 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -122,6 +122,32 @@ void CUDAPinnedGarbageCollector::ClearCallback( } #endif +#ifdef PADDLE_WITH_ASCEND_CL +NPUDefaultStreamGarbageCollector::NPUDefaultStreamGarbageCollector( + const platform::NPUPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void NPUDefaultStreamGarbageCollector::Wait() const { + static_cast(this->dev_ctx_) + ->WaitStreamCallback(); +} + +void NPUDefaultStreamGarbageCollector::ClearCallback( + const std::function &callback) { + static_cast(this->dev_ctx_) + ->AddStreamCallback(callback); +} +NPUUnsafeFastGarbageCollector::NPUUnsafeFastGarbageCollector( + const platform::NPUPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void NPUUnsafeFastGarbageCollector::ClearCallback( + const std::function &callback) { + callback(); +} + +#endif + int64_t GetEagerDeletionThreshold() { return FLAGS_eager_delete_tensor_gb < 0 ? -1 diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 572c79d21a045..2c2b57bbe420a 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -131,6 +131,28 @@ class CUDAPinnedGarbageCollector : public GarbageCollector { }; #endif +#ifdef PADDLE_WITH_ASCEND_CL +class NPUDefaultStreamGarbageCollector : public GarbageCollector { + public: + NPUDefaultStreamGarbageCollector(const platform::NPUPlace &place, + size_t max_memory_size); + + void Wait() const override; + + protected: + void ClearCallback(const std::function &callback) override; +}; + +class NPUUnsafeFastGarbageCollector : public GarbageCollector { + public: + NPUUnsafeFastGarbageCollector(const platform::NPUPlace &place, + size_t max_memory_size); + + protected: + void ClearCallback(const std::function &callback) override; +}; +#endif + template void GarbageCollector::Add(Container &&objs) { Add(std::forward(objs), []() {}); diff --git a/paddle/fluid/framework/heter_service.h b/paddle/fluid/framework/heter_service.h index 8f52235c96244..3f65eaf3aa121 100644 --- a/paddle/fluid/framework/heter_service.h +++ b/paddle/fluid/framework/heter_service.h @@ -30,10 +30,12 @@ limitations under the License. 
*/ #include "brpc/controller.h" #include "brpc/server.h" #include "paddle/fluid/platform/timer.h" +#endif namespace paddle { namespace framework { +#ifdef PADDLE_WITH_PSLIB typedef std::function HeterServiceHandler; class DataFeed; @@ -142,7 +144,7 @@ class HeterTask { double cpu_2_gpu_time{0}; platform::Timer timeline; }; - +#endif template class HeterObjectPool { public: @@ -153,7 +155,7 @@ class HeterObjectPool { if (pool_.empty()) { num_ += 1; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - VLOG(0) << "pool construct size: " << num_; + VLOG(3) << "pool construct size: " << num_; #endif return std::make_shared(); } else { @@ -178,6 +180,7 @@ class HeterObjectPool { int num_{0}; }; +#ifdef PADDLE_WITH_PSLIB struct BthreadMutextGuard { BthreadMutextGuard(bthread_mutex_t* rho) { mutex_ = rho; @@ -258,7 +261,6 @@ class HeterList { std::unique_lock lock(mutex_); cond_.wait(lock, [this] { return size < cap_; }); if (task_map_.find(key) != task_map_.end()) { - // std::cout << "try put key=" << key << " false" << std::endl; task_map_.erase(key); return false; } else { @@ -267,7 +269,6 @@ class HeterList { node->value = value; map_[node->key] = node; attach(node); - // std::cout << "try put key=" << key << " true" << std::endl; return true; } } @@ -276,7 +277,6 @@ class HeterList { std::unique_lock lock(mutex_); cond_.wait(lock, [this] { return size < cap_; }); HeterNode* node = new HeterNode; - // std::cout << "put key=" << key << " true" << std::endl; node->key = key; node->value = value; map_[node->key] = node; @@ -288,7 +288,6 @@ class HeterList { std::lock_guard lock(mutex_); auto iter = map_.find(key); if (iter != map_.end()) { - // std::cout << "try get key=" << key << " true" << std::endl; HeterNode* node = iter->second; detach(node); cond_.notify_one(); @@ -298,7 +297,6 @@ class HeterList { return ret; } task_map_.insert(key); - // std::cout << "try get key=" << key << " false" << std::endl; return nullptr; } @@ -306,7 +304,6 @@ class HeterList { std::lock_guard lock(mutex_); auto iter = map_.find(key); if (iter != map_.end()) { - // std::cout << "get key=" << key << " true" << std::endl; HeterNode* node = iter->second; detach(node); cond_.notify_one(); @@ -315,7 +312,6 @@ class HeterList { delete node; return ret; } - // std::cout << "get key=" << key << " false" << std::endl; return nullptr; } @@ -323,14 +319,12 @@ class HeterList { std::lock_guard lock(mutex_); HeterNode* node = head_->next; if (node == tail_) { - // std::cout << "get2 false" << std::endl; return nullptr; } else { detach(node); cond_.notify_one(); T ret = std::move(node->value); map_.erase(node->key); - // std::cout << "get2 key=" << node->key << " true" << std::endl; delete node; return ret; } @@ -371,7 +365,7 @@ class HeterList { int cap_; int size; }; +#endif } // namespace framework } // namespace paddle -#endif diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc index 44069f61d93ff..59d071e103459 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc @@ -299,6 +299,11 @@ static int BuildFusion(Graph* graph, const std::string& name_scope new_op_desc.SetOutput("Out", {end_pattern_out[k]->Name()}); new_op_desc.SetAttr("epsilon", end_patter_layernorms[k]->Op()->GetAttr("epsilon")); + + if (end_patter_layernorms[k]->Op()->HasAttr("out_threshold")) { + new_op_desc.SetAttr("enable_int8", true); + } + auto* 
embedding_eltwise_layernorm = graph->CreateOpNode(&new_op_desc); for (size_t iter = 0; iter < start_pattern_in_nodes[i].size(); ++iter) { diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 0a70440765d44..25bf03f426a1d 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -35,7 +35,7 @@ void IsTestPass::ApplyImpl(ir::Graph* graph) const { "hard_shrink", "hard_sigmoid", "relu6", "soft_relu", "swish", "thresholded_relu", "log", "square", "softplus", - "softsign"}; + "softsign", "silu"}; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); diff --git a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc index 06df1caca35b9..4eb532b47cb4b 100644 --- a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc @@ -43,8 +43,9 @@ void InterpolateMKLDNNPass::ApplyImpl(ir::Graph* graph) const { int found_count = 0; const std::vector interpolate_op_types = { - "bilinear_interp", "nearest_interp", "trilinear_interp", "bicubic_interp", - "linear_interp"}; + "bilinear_interp", "nearest_interp", "trilinear_interp", + "bicubic_interp", "linear_interp", "bilinear_interp_v2", + "nearest_interp_v2"}; for (const Node* node : graph->Nodes()) { if (node->IsOp() && diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index e8f4dbd29543c..1e8349e878781 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -535,6 +535,38 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope, multihead_op_desc.SetAttr("alpha", scale_attr); multihead_op_desc.SetAttr("head_number", head_number); + auto* mul0_op_desc = mul0->Op(); + auto* mul1_op_desc = mul1->Op(); + auto* mul2_op_desc = mul2->Op(); + if (mul0_op_desc->HasAttr("enable_int8")) { + multihead_op_desc.SetAttr("enable_int8", + mul0_op_desc->GetAttr("enable_int8")); + // all mul op has same input. 
+ multihead_op_desc.SetAttr("Input_scale", + mul0_op_desc->GetAttr("X_scale")); + auto weight_scale0 = BOOST_GET_CONST( + std::vector, mul0_op_desc->GetAttr("weight_scale")); + auto weight_scale1 = BOOST_GET_CONST( + std::vector, mul1_op_desc->GetAttr("weight_scale")); + auto weight_scale2 = BOOST_GET_CONST( + std::vector, mul2_op_desc->GetAttr("weight_scale")); + auto weight_max = std::max(weight_scale0, weight_scale1); + weight_max = std::max(weight_max, weight_scale2); + multihead_op_desc.SetAttr("weight_scale", weight_max); + + if (mul0_op_desc->HasAttr("out_threshold")) { + auto out_scale0 = + BOOST_GET_CONST(float, mul0_op_desc->GetAttr("out_threshold")); + auto out_scale1 = + BOOST_GET_CONST(float, mul1_op_desc->GetAttr("out_threshold")); + auto out_scale2 = + BOOST_GET_CONST(float, mul2_op_desc->GetAttr("out_threshold")); + auto out_scale_max = std::max(out_scale0, out_scale1); + out_scale_max = std::max(out_scale_max, out_scale2); + multihead_op_desc.SetAttr("out_threshold", out_scale_max); + } + } + auto* multihead = graph->CreateOpNode(&multihead_op_desc); IR_NODE_LINK_TO(input0, multihead); diff --git a/paddle/fluid/framework/ir/placement_pass_base.cc b/paddle/fluid/framework/ir/placement_pass_base.cc index fd604ffe7b5de..35ba920060779 100644 --- a/paddle/fluid/framework/ir/placement_pass_base.cc +++ b/paddle/fluid/framework/ir/placement_pass_base.cc @@ -77,7 +77,8 @@ bool PlacementPassBase::IsDefaultOpTypes(const std::string& op_type) const { // the corresponding pass. const std::vector not_default_op_types = { "bilinear_interp", "nearest_interp", "trilinear_interp", - "bicubic_interp", "linear_interp"}; + "bicubic_interp", "linear_interp", "bilinear_interp_v2", + "linear_interp_v2"}; bool is_interpolate_op = std::find(not_default_op_types.begin(), not_default_op_types.end(), op_type) != not_default_op_types.end(); diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 5043fce8885cd..2fc39fd25d56c 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -225,10 +225,13 @@ void FuseDequant(ir::Graph* graph, Scope* scope, quantized_op_type == "depthwise_conv2d") { PADDLE_ENFORCE_EQ( dequant_type, "fake_channel_wise_dequantize_max_abs", - platform::errors::InvalidArgument("conv2d op must be dequantized by " - "[fake_channel_wise_dequantize_max_" - "abs], but got %s", - dequant_type)); + platform::errors::InvalidArgument( + "conv2d op must be dequantized by " + "[fake_channel_wise_dequantize_max_abs], but got %s. 
" + "If you uses PaddleSlim to generate the quantized " + "model, please set the 'weight_quantize_type' params as " + "'channel_wise_abs_max' and generate the quantized model again.", + dequant_type)); PADDLE_ENFORCE_EQ( weight_scale.size(), static_cast(w_dims[0]), platform::errors::InvalidArgument( diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index 0e63320f2f7ad..232e1d8da4ded 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -153,6 +153,10 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { new_desc.SetInput("Scale", {layer_norm_scale->Name()}); new_desc.SetInput("Bias", {layer_norm_bias->Name()}); + if (elementwise->Op()->HasAttr("out_threshold")) { + new_desc.SetAttr("enable_int8", true); + } + // outputs new_desc.SetOutput("Out", {layer_norm_out->Name()}); diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 3a79452e230ef..0a6b5e44452fe 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -268,6 +268,21 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor, TensorToStream(os, static_cast(tensor), dev_ctx); } +void SerializeToStream(std::ostream &os, const LoDTensor &tensor) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext *dev_ctx; + auto place = tensor.place(); + dev_ctx = pool.Get(place); + SerializeToStream(os, tensor, *dev_ctx); +} + +void DeserializeFromStream(std::ifstream &os, LoDTensor *tensor) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext *dev_ctx; + dev_ctx = pool.Get(platform::CPUPlace()); + DeserializeFromStream(os, tensor, *dev_ctx); +} + void DeserializeFromStream(std::istream &is, LoDTensor *tensor, const platform::DeviceContext &dev_ctx, const size_t &seek, diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index b8911154e6bf7..6b357aba1c5f9 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -14,16 +14,11 @@ limitations under the License. 
*/ #pragma once +#include #include #include #include #include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#include -#endif - -#include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/mixed_vector.h" @@ -260,5 +255,9 @@ LoD ConvertToLengthBasedLoD(const LoD& offset_lod); LoD ConvertToOffsetBasedLoD(const LoD& length_lod); +void SerializeToStream(std::ostream& os, const LoDTensor& tensor); + +void DeserializeFromStream(std::ifstream& os, LoDTensor* tensor); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index ff8e71b92e0ac..198bb65863bb6 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -38,6 +38,13 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, need_merge_var_names_.push_back( trainer_desc.downpour_param().stat_var_names(i)); } +#ifdef PADDLE_WITH_HETERPS + for (int i = 0; i < thread_num_; ++i) { + int num = trainer_desc.worker_places(i); + platform::CUDAPlace place = platform::CUDAPlace(num); + places_.push_back(place); + } +#endif // get filelist from trainer_desc here const std::vector readers = dataset->GetReaders(); @@ -102,13 +109,42 @@ void MultiTrainer::InitDumpEnv() { void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, const platform::Place& place) { for (int i = 0; i < thread_num_; ++i) { +#ifdef PADDLE_WITH_HETERPS + workers_[i]->SetPlace(places_[i]); + workers_[i]->SetReaderPlace(places_[i]); +#else workers_[i]->SetPlace(place); workers_[i]->SetReaderPlace(place); +#endif workers_[i]->SetRootScope(root_scope_); workers_[i]->CreateDeviceResource(main_program); // Program workers_[i]->BindingDataFeedMemory(); workers_[i]->CacheProgram(main_program); } +#ifdef PADDLE_WITH_HETERPS + for (int num = 0; num < thread_num_; ++num) { + auto place = places_[num]; + Scope* scope = workers_[num]->GetThreadScope(); + auto& block = main_program.Block(0); + for (auto& var : block.AllVars()) { + if (var->Persistable()) { + auto name = var->Name(); + Variable* root_var = root_scope_->FindVar(name); + if (!root_var) { + continue; + } + if (root_var->IsType()) { + continue; + } + LoDTensor* root_tensor = root_var->GetMutable(); + auto* ptr = scope->Var(name); + InitializeVariable(ptr, proto::VarType::LOD_TENSOR); + LoDTensor* thread_tensor = ptr->GetMutable(); + TensorCopy(*root_tensor, place, thread_tensor); + } + } + } +#endif } void MultiTrainer::InitOtherEnv(const ProgramDesc& main_program) { @@ -138,10 +174,77 @@ void MultiTrainer::Run() { } } +#ifdef PADDLE_WITH_HETERPS +void MultiTrainer::MergeDenseParam() { + auto communicator = paddle::distributed::Communicator::GetInstance(); + auto& recv_ctx = communicator->GetRecvCtxMap(); + Scope* thread_scope = workers_[0]->GetThreadScope(); + for (auto& iter : recv_ctx) { + auto& varnames = iter.second; + for (auto& name : varnames) { + Variable* root_var = root_scope_->FindVar(name); + LoDTensor* root_tensor = root_var->GetMutable(); + Variable* var = thread_scope->FindVar(name); + LoDTensor* tensor = var->GetMutable(); + TensorCopy((*tensor), root_tensor->place(), root_tensor); + } + } +} +#endif + +template +void MultiTrainer::MergeToRootScope(LoDTensor* root_tensor, LoDTensor* tensor) { + LoDTensor tmp_root; + TensorCopy(*root_tensor, platform::CPUPlace(), &tmp_root); + T* tmp_root_data = tmp_root.data(); + LoDTensor tmp_tensor; + TensorCopy(*tensor, platform::CPUPlace(), &tmp_tensor); + T* data = tmp_tensor.data(); 
+ for (int i = 0; i < tmp_tensor.numel(); i++) { + tmp_root_data[i] += data[i]; + } + TensorCopy(tmp_root, platform::CPUPlace(), root_tensor); +} + void MultiTrainer::Finalize() { if (need_dump_field_ || need_dump_param_) { FinalizeDumpEnv(); } +#ifdef PADDLE_WITH_HETERPS + for (size_t i = 0; i < need_merge_var_names_.size(); i++) { + Variable* root_var = root_scope_->FindVar(need_merge_var_names_[i]); + if (root_var == nullptr) { + continue; + } + LoDTensor* root_tensor = root_var->GetMutable(); + + for (size_t j = 0; j < places_.size(); j++) { + Scope* cur_thread_scope = workers_[j]->GetThreadScope(); + Variable* thread_var = + cur_thread_scope->FindVar(need_merge_var_names_[i]); + if (thread_var == nullptr) { + continue; + } + LoDTensor* thread_tensor = thread_var->GetMutable(); +#define MergeCallback(cpp_type, proto_type) \ + do { \ + if (root_tensor->type() == proto_type) { \ + if (thread_tensor->type() != proto_type) { \ + VLOG(0) << "Error: thread id=" << j << ", need_merge_var_names_[" << i \ + << "] " << need_merge_var_names_[i] \ + << ", root tensor type=" << root_tensor->type() \ + << ", thread tensor type=" << thread_tensor->type(); \ + exit(-1); \ + } \ + MergeToRootScope(root_tensor, thread_tensor); \ + } \ + } while (0) + _ForEachDataType_(MergeCallback); + } + } + MergeDenseParam(); + +#endif root_scope_->DropKids(); } diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 4c52932976122..818da7478b239 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -343,6 +343,12 @@ struct OpKernelRegistrarFunctorEx &places, InitP2P(places); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); - member_->use_device_ = exec_strategy.use_device_; - member_->build_strategy_ = build_strategy; - member_->use_all_reduce_ = member_->build_strategy_.reduce_ == - BuildStrategy::ReduceStrategy::kAllReduce; - member_->nranks_ = build_strategy.num_trainers_ * places.size(); - if (!member_->use_all_reduce_ && member_->nranks_ == 1) { - LOG(INFO) << "If you set build_strategy.reduce with 'Reduce'," - "the number of places should be greater than 1."; - member_->build_strategy_.reduce_ = - BuildStrategy::ReduceStrategy::kAllReduce; - member_->use_all_reduce_ = true; - } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) - if (member_->IsUseCUDA(member_->use_device_)) { - PADDLE_ENFORCE_EQ( - places.size(), 1, - platform::errors::Unavailable("Windows can support Single GPU only.")); - } -#endif - -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ - (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) - if (member_->IsUseCUDA(member_->use_device_)) { - PADDLE_ENFORCE_EQ( - places.size(), 1, - platform::errors::PermissionDenied( - "Your machine has multiple cards, " - "but the WITH_NCCL option is not turned on during compilation, " - "and you cannot use multi-card training or prediction. " - "Please recompile and turn on the WITH_NCCL option.")); - } -#endif - - std::string device_name; - if (member_->use_device_ == p::kCPU) { - device_name = "CPU"; - } else if (member_->use_device_ == p::kCUDA) { - device_name = "CUDA"; - } else { - device_name = "XPU"; - } - - VLOG(1) << string::Sprintf( - "The Program will be executed on %s using ParallelExecutor, %lu " - "cards are used, so %lu programs are executed in parallel.", - device_name, places.size(), places.size()); - - // Step 1. Bcast the bcast_vars to devs. 
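Since the MergeToRootScope template above is split across hunks, here is a condensed, CPU-only restatement of what it does for one float variable: copy both tensors to CPU, accumulate element-wise, and write the sum back. This mirrors the hunk above and is an illustration, not the dispatch actually used (which goes through the MergeCallback macro over all data types).

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/place.h"

void AccumulateIntoRoot(paddle::framework::LoDTensor* root_tensor,
                        const paddle::framework::LoDTensor& thread_tensor) {
  namespace pf = paddle::framework;
  pf::LoDTensor tmp_root, tmp_thread;
  pf::TensorCopy(*root_tensor, paddle::platform::CPUPlace(), &tmp_root);
  pf::TensorCopy(thread_tensor, paddle::platform::CPUPlace(), &tmp_thread);
  float* dst = tmp_root.data<float>();
  const float* src = tmp_thread.data<float>();
  for (int64_t i = 0; i < tmp_thread.numel(); ++i) {
    dst[i] += src[i];
  }
  // Write the merged result back to the root-scope tensor on CPUPlace,
  // as the template above does.
  pf::TensorCopy(tmp_root, paddle::platform::CPUPlace(), root_tensor);
}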
- // Create local scopes - if (local_scopes.empty()) { - member_->own_local_scope_ = true; - member_->local_scopes_.emplace_back(member_->global_scope_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(&scope->NewScope()); - } - } else { - member_->own_local_scope_ = false; - PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size(), - platform::errors::PreconditionNotMet( - "member_->places_.size() = %d is not equal to " - "local_scopes.size() = %d", - member_->places_.size(), local_scopes.size())); - for (size_t i = 0; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); - } - } - - std::vector graphs; - if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false, - platform::errors::Unavailable( - "gpu mode does not support async_mode_ now!")); - graphs.push_back(graph); - for (size_t i = 1; i < places.size(); ++i) { - auto *tmp_graph = new ir::Graph(graph->OriginProgram()); - async_graphs_.emplace_back(tmp_graph); - graphs.push_back(tmp_graph); - } - } - - // FIXME(Yancey1989): parallel graph mode get better performance - // in GPU allreduce distributed training. Need an elegant way to - // choice the execution strategy. - member_->build_strategy_.enable_parallel_graph_ = - EnableParallelGraphExecution(*graph, exec_strategy, - member_->build_strategy_); - if (member_->build_strategy_.enable_parallel_graph_) { - LOG(INFO) << "The Executor would execute the graph by ParallelGraph " - "Execution which can get better performance," - << "you can force it off by env FLAGS_enable_parallel_graph=0"; - } + // Initialize necessary info of member_ with strategy. + InitExecutorPrivateMemberInfo(exec_strategy, build_strategy, places.size(), + *graph); - if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_); - - // Initialize device context's nccl comm, will be used by normal - // Operators like sync_batch_norm, and collective ops. - // NOTE: more than one ParallelExecutor with same place, the nccl comm will - // be rewrite and there will be some problem. - // NOTE: NCCL group-calls and non-group-calls can not use the same - // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use - // same communicators. - auto *nccl_ctxs = - member_->nccl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_); - auto &pool = platform::DeviceContextPool::Instance(); - for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); - auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); - dev_ctx->set_nccl_comm(nccl_ctx.comm()); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with CUDA.")); -#endif - } - if (member_->use_device_ == p::kXPU && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_XPU_BKCL) - member_->InitOrGetBKCLCommunicator(scope, member_->build_strategy_); + // Step 1. 
Create local scopes and Clone graph into multi device + CreateLocalScopes(scope, local_scopes, /*create_new*/ true); + std::vector graphs = CloneGraphToMultiDevices(graph); + PrepareNCCLCommunicator(scope); - auto *bkcl_ctxs = - member_->bkcl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_); - auto &pool = platform::DeviceContextPool::Instance(); - for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); - auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); - dev_ctx->set_bkcl_context(bkcl_ctx.comm()); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with XPU.")); -#endif - } // broadcast parameters from the 0th device to others: auto need_broadcast = [&]() -> bool { if (member_->build_strategy_.num_trainers_ > 1) { @@ -778,257 +651,75 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, } return false; }; - // Bcast Parameters to all GPUs if (need_broadcast()) { BCastParamsToDevices(bcast_vars, member_->build_strategy_.trainer_id_); } - // Startup Program has been run. All local scopes has correct parameters. - // Step 2. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp - std::vector async_graphs(places.size()); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_, - member_->nccl_ctxs_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_, - member_->nccl_ctxs_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_, member_->nccl_ctxs_); - } -#elif defined(PADDLE_WITH_XPU_BKCL) - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_, - member_->bkcl_ctxs_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_, - member_->bkcl_ctxs_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_, member_->bkcl_ctxs_); - } -#else - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_); - } -#endif - + std::vector async_graphs = + CompileGraphWithBuildStrategy(graph, &graphs, 
loss_var_name); graph = member_->ApplyMemoryOptimizePass(graph); - async_graphs[0] = graph; // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); - - member_->is_persistable_.emplace(node->Var()->Name(), - node->Var()->Persistable()); - } - } - - if (graph->Has(details::kFusedVars)) { - auto &fused_vars = graph->Get(details::kFusedVars); - for (auto &fused_var : fused_vars) { - var_infos.emplace_back(); - var_infos.back() = fused_var.second; + CreateVariableInfos(&var_infos, graph); + std::unordered_map scope_map = + CreateLocalExecScopes(member_->local_scopes_, /*create_new*/ true); - member_->is_persistable_.emplace(fused_var.first, - fused_var.second.persistable_); - } - } + // Step 4. Create SSAGraph executor + std::vector final_graphs = + CreateSSAGraphExecutor(exec_strategy, &async_graphs, graph); - std::unordered_map scope_map; - for (auto *scope : member_->local_scopes_) { - auto &local_exec_scope = scope->NewScope(); - member_->local_exec_scopes_.emplace_back(&local_exec_scope); - scope_map.emplace(scope, &local_exec_scope); + VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; + if (!member_->build_strategy_.async_mode_) { + member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + std::move(var_infos), member_->places_, std::move(member_->executor_))); } - PADDLE_ENFORCE_EQ( - member_->local_scopes_.size(), member_->local_exec_scopes_.size(), - platform::errors::PreconditionNotMet( - "member_->local_scopes_.size() = %d is not equal to " - "member_->local_exec_scopes_.size() = %d", - member_->local_scopes_.size(), member_->local_exec_scopes_.size())); + ResetOpHandleScopeMapOfGraphs(final_graphs, scope_map); + SetReaderOpDeviceInfoOfGraphs(final_graphs); +} - std::vector final_graphs; +void ParallelExecutor::BCastParamsToDevices( + const std::vector &vars, int trainer_id) const { + VLOG(3) << "BCastParamsToDevices"; + // the initializing bcast, all vars would be bcast from device(0). + for (auto &var : vars) { + framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); + if (main_var == nullptr || !main_var->IsType()) { + continue; + } - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use AsyncSSAGraphExecutor"; - member_->executor_.reset(new details::AsyncSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, async_graphs)); - final_graphs = async_graphs; - } else if (member_->build_strategy_.enable_parallel_graph_) { - VLOG(3) << "use ParallelSSAGraphExecutor"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // TODO(Yancey1989): Remove passing in the main_program when - // allreduce_seq_pass doesn't need it as the attr. 
- bool is_inference = details::IsDataParallelInferenceGraph(*graph); - bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + auto &main_tensor = main_var->Get(); + if (!main_tensor.IsInitialized()) { + VLOG(3) << "one in var not inited, return!"; + continue; + } + auto &dims = main_tensor.dims(); + if (paddle::platform::is_gpu_place(main_tensor.place())) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + std::vector buffers; + buffers.reserve(member_->places_.size()); + size_t numel = main_tensor.numel(); + ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto place = member_->places_[i]; + void *buffer; - auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph); - final_graphs = pg_exe->Graphs(); - member_->executor_.reset(pg_exe); - - if (is_inference && member_->places_.size() > 1) { - member_->inference_executor_ = pg_exe; - if (!has_drop_last_read_op) { - VLOG(5) << "Enable partial feed support in inference phase"; - pg_exe->EnablePartialFeedSupport(); - } - } -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Paddle should be compiled with CUDA for ParallelGraph Execution.")); -#endif - } else { - bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); - auto possible_inference_graphs = - details::TrySeparateToMultipleSingleDeviceGraphs(graph); - if (!possible_inference_graphs.empty()) { - VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase"; - auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, std::move(possible_inference_graphs)); - if (!has_drop_last_read_op) { - VLOG(5) << "Enable partial feed support in inference phase"; - pg_exe->EnablePartialFeedSupport(); - } - final_graphs = pg_exe->Graphs(); - member_->executor_.reset(pg_exe); - member_->inference_executor_ = pg_exe; - } else { - LOG_IF(WARNING, details::HasKeepLastReadOp(*graph)) - << "drop_last=False for DataLoader is not supported in training " - "network. 
It is automatically turned to drop_last=True."; - if (exec_strategy.type_ == ExecutionStrategy::kDefault) { - VLOG(3) << "use ThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph)); - } else { - if (member_->use_device_ == p::kXPU) { -#if defined(PADDLE_WITH_XPU) - VLOG(3) << "use BindThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::BindThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, - member_->local_exec_scopes_, member_->places_, graph)); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use XPU device since it's not compiled with XPU," - "Please recompile or reinstall Paddle with XPU support.")); -#endif - } else { - VLOG(3) << "use FastThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, - member_->local_exec_scopes_, member_->places_, graph)); - } - } - final_graphs.emplace_back(graph); - } - } - - VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; - if (!member_->build_strategy_.async_mode_) { - member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - std::move(var_infos), member_->places_, std::move(member_->executor_))); - } - - for (auto *g : final_graphs) { - auto ops = ir::FilterByNodeWrapper(*g); - for (auto *op : ops) { - op->SetLocalExecScopes(scope_map); - } - } - - if (final_graphs.size() == 1) { - ir::SetReaderOpDeviceInfo(final_graphs[0], member_->places_.size()); - } else { - for (size_t i = 0; i < final_graphs.size(); ++i) { - ir::SetReaderOpDeviceInfo(final_graphs[i], member_->places_.size(), i); - } - } -} - -void ParallelExecutor::BCastParamsToDevices( - const std::vector &vars, int trainer_id) const { - VLOG(3) << "BCastParamsToDevices"; - // the initializing bcast, all vars would be bcast from device(0). 
- for (auto &var : vars) { - framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); - if (main_var == nullptr || !main_var->IsType()) { - continue; - } - - auto &main_tensor = main_var->Get(); - if (!main_tensor.IsInitialized()) { - VLOG(3) << "one in var not inited, return!"; - continue; - } - auto &dims = main_tensor.dims(); - if (paddle::platform::is_gpu_place(main_tensor.place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - std::vector buffers; - buffers.reserve(member_->places_.size()); - size_t numel = main_tensor.numel(); - ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto place = member_->places_[i]; - void *buffer; - - if (i == 0 && trainer_id == 0) { - buffer = const_cast(main_tensor.data()); - } else { - auto local_scope = member_->local_scopes_[i]; - auto *t = local_scope->Var(var)->GetMutable(); - t->Resize(dims); - buffer = t->mutable_data(place, main_tensor.type()); - } - buffers.push_back(buffer); - } + if (i == 0 && trainer_id == 0) { + buffer = const_cast(main_tensor.data()); + } else { + auto local_scope = member_->local_scopes_[i]; + auto *t = local_scope->Var(var)->GetMutable(); + t->Resize(dims); + buffer = t->mutable_data(place, main_tensor.type()); + } + buffers.push_back(buffer); + } PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(), platform::errors::PreconditionNotMet( @@ -1367,6 +1058,399 @@ bool ParallelExecutor::EnableParallelGraphExecution( return enable_parallel_graph; } +void ParallelExecutor::InitExecutorPrivateMemberInfo( + const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, + size_t device_count, const ir::Graph &graph) { + member_->use_device_ = exec_strategy.use_device_; + member_->build_strategy_ = build_strategy; + member_->use_all_reduce_ = member_->build_strategy_.reduce_ == + BuildStrategy::ReduceStrategy::kAllReduce; + member_->nranks_ = build_strategy.num_trainers_ * device_count; + if (!member_->use_all_reduce_ && member_->nranks_ == 1) { + LOG(INFO) << "If you set build_strategy.reduce with 'Reduce'," + "the number of places should be greater than 1."; + member_->build_strategy_.reduce_ = + BuildStrategy::ReduceStrategy::kAllReduce; + member_->use_all_reduce_ = true; + } +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) + if (member_->IsUseCUDA(member_->use_device_)) { + PADDLE_ENFORCE_EQ( + device_count, 1, + platform::errors::Unavailable("Windows can support Single GPU only.")); + } +#endif + +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) + if (member_->IsUseCUDA(member_->use_device_)) { + PADDLE_ENFORCE_EQ( + device_count, 1, + platform::errors::PermissionDenied( + "Your machine has multiple cards, " + "but the WITH_NCCL option is not turned on during compilation, " + "and you cannot use multi-card training or prediction. 
" + "Please recompile and turn on the WITH_NCCL option.")); + } +#endif + + std::string device_name; + if (member_->use_device_ == p::kCPU) { + device_name = "CPU"; + } else if (member_->use_device_ == p::kCUDA) { + device_name = "CUDA"; + } else { + device_name = "XPU"; + } + + VLOG(1) << string::Sprintf( + "The Program will be executed on %s using ParallelExecutor, %lu " + "cards are used, so %lu programs are executed in parallel.", + device_name, device_count, device_count); + + // FIXME(Yancey1989): parallel graph mode get better performance + // in GPU allreduce distributed training. Need an elegant way to + // choice the execution strategy. + member_->build_strategy_.enable_parallel_graph_ = + EnableParallelGraphExecution(graph, exec_strategy, + member_->build_strategy_); + if (member_->build_strategy_.enable_parallel_graph_) { + LOG(INFO) << "The Executor would execute the graph by ParallelGraph " + "Execution which can get better performance," + << "you can force it off by env FLAGS_enable_parallel_graph=0"; + } +} + +void ParallelExecutor::CreateLocalScopes( + Scope *global_scope, const std::vector &local_scopes, + bool create_new) { + if (local_scopes.empty()) { + member_->own_local_scope_ = true; + member_->local_scopes_.emplace_back(global_scope); + for (size_t i = 1; i < member_->places_.size(); ++i) { + member_->local_scopes_.emplace_back(&global_scope->NewScope()); + } + } else { + member_->own_local_scope_ = false; + PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size(), + platform::errors::PreconditionNotMet( + "member_->places_.size() = %d is not equal to " + "local_scopes.size() = %d", + member_->places_.size(), local_scopes.size())); + for (size_t i = 0; i < member_->places_.size(); ++i) { + if (create_new) { + member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); + } else { + // Use local scopes directly + member_->local_scopes_.emplace_back(local_scopes[i]); + } + } + } +} + +std::unordered_map ParallelExecutor::CreateLocalExecScopes( + const std::vector &local_scopes, bool create_new) { + std::unordered_map scope_map; + + for (auto *scope : local_scopes) { + Scope *local_exec_scope = scope; + if (create_new) { + local_exec_scope = &scope->NewScope(); + } + member_->local_exec_scopes_.emplace_back(local_exec_scope); + scope_map.emplace(scope, local_exec_scope); + } + + PADDLE_ENFORCE_EQ( + member_->local_scopes_.size(), member_->local_exec_scopes_.size(), + platform::errors::PreconditionNotMet( + "member_->local_scopes_.size() = %d is not equal to " + "member_->local_exec_scopes_.size() = %d", + member_->local_scopes_.size(), member_->local_exec_scopes_.size())); + + return scope_map; +} + +std::vector ParallelExecutor::CloneGraphToMultiDevices( + ir::Graph *graph) { + std::vector graphs; + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false, + platform::errors::Unavailable( + "gpu mode does not support async_mode_ now!")); + graphs.push_back(graph); + for (size_t i = 1; i < member_->places_.size(); ++i) { + auto *tmp_graph = new ir::Graph(graph->OriginProgram()); + async_graphs_.emplace_back(tmp_graph); + graphs.push_back(tmp_graph); + } + } + + return graphs; +} + +void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { + if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + member_->InitOrGetNCCLCommunicator(global_scope, &member_->build_strategy_); + + // Initialize device context's nccl 
comm, will be used by normal + // Operators like sync_batch_norm, and collective ops. + // NOTE: more than one ParallelExecutor with same place, the nccl comm will + // be rewrite and there will be some problem. + // NOTE: NCCL group-calls and non-group-calls can not use the same + // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use + // same communicators. + auto *nccl_ctxs = member_->nccl_ctxs_->GetSyncBatchNormCtx( + global_scope, member_->places_); + auto &pool = platform::DeviceContextPool::Instance(); + for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { + auto *dev_ctx = static_cast( + pool.Get(member_->places_[dev_id])); + auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_nccl_comm(nccl_ctx.comm()); + } +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); +#endif + } + if (member_->use_device_ == p::kXPU && member_->nranks_ > 1) { +#if defined(PADDLE_WITH_XPU_BKCL) + member_->InitOrGetBKCLCommunicator(global_scope, member_->build_strategy_); + + auto *bkcl_ctxs = member_->bkcl_ctxs_->GetSyncBatchNormCtx( + global_scope, member_->places_); + auto &pool = platform::DeviceContextPool::Instance(); + for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { + auto *dev_ctx = static_cast( + pool.Get(member_->places_[dev_id])); + auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_bkcl_context(bkcl_ctx.comm()); + } +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with XPU.")); +#endif + } +} + +std::vector ParallelExecutor::CompileGraphWithBuildStrategy( + ir::Graph *graph, std::vector *device_graphs, + const std::string &loss_var_name) { + auto device_count = member_->places_.size(); + std::vector async_graphs(device_count); + + auto &graphs = *device_graphs; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(graphs.size(), device_count, + platform::errors::PreconditionNotMet( + "graphs.size() shoule be %d, but received %d", + device_count, graphs.size())); + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_, + member_->nccl_ctxs_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_, + member_->nccl_ctxs_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_, member_->nccl_ctxs_); + } +#elif defined(PADDLE_WITH_XPU_BKCL) + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(graphs.size(), device_count, + platform::errors::PreconditionNotMet( + "graphs.size() shoule be %d, but received %d", + device_count, graphs.size())); + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_, + member_->bkcl_ctxs_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_, + member_->bkcl_ctxs_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + 
graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_, member_->bkcl_ctxs_); + } +#else + if (member_->build_strategy_.async_mode_) { + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_); + } +#endif + + return async_graphs; +} + +void ParallelExecutor::CreateVariableInfos( + std::vector<details::VariableInfo> *var_infos, ir::Graph *graph) { + PADDLE_ENFORCE_EQ( + var_infos->size(), 0, + platform::errors::PreconditionNotMet( + "var_infos->size() should be 0, but received %d", var_infos->size())); + PADDLE_ENFORCE_EQ( + member_->is_persistable_.size(), 0, + platform::errors::PreconditionNotMet( + "member_->is_persistable_.size() should be 0, but received %d", + member_->is_persistable_.size())); + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos->emplace_back(); + var_infos->back().name_ = node->Var()->Name(); + var_infos->back().type_ = node->Var()->GetType(); + var_infos->back().persistable_ = node->Var()->Persistable(); + + member_->is_persistable_.emplace(node->Var()->Name(), + node->Var()->Persistable()); + } + } + + if (graph->Has(details::kFusedVars)) { + auto &fused_vars = graph->Get(details::kFusedVars); + for (auto &fused_var : fused_vars) { + var_infos->emplace_back(); + var_infos->back() = fused_var.second; + + member_->is_persistable_.emplace(fused_var.first, + fused_var.second.persistable_); + } + } +} + +std::vector<ir::Graph *> ParallelExecutor::CreateSSAGraphExecutor( + const ExecutionStrategy &exec_strategy, + std::vector<ir::Graph *> *async_graphs, ir::Graph *graph) { + std::vector<ir::Graph *> final_graphs; + + if (member_->build_strategy_.async_mode_) { + VLOG(3) << "use AsyncSSAGraphExecutor"; + member_->executor_.reset(new details::AsyncSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, *async_graphs)); + final_graphs = *async_graphs; + } else if (member_->build_strategy_.enable_parallel_graph_) { + VLOG(3) << "use ParallelSSAGraphExecutor"; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // TODO(Yancey1989): Remove passing in the main_program when + // allreduce_seq_pass doesn't need it as the attr.
+ bool is_inference = details::IsDataParallelInferenceGraph(*graph); + bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + + auto *pg_exe = new details::ParallelSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph); + final_graphs = pg_exe->Graphs(); + member_->executor_.reset(pg_exe); + + if (is_inference && member_->places_.size() > 1) { + member_->inference_executor_ = pg_exe; + if (!has_drop_last_read_op) { + VLOG(5) << "Enable partial feed support in inference phase"; + pg_exe->EnablePartialFeedSupport(); + } + } +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Paddle should be compiled with CUDA for ParallelGraph Execution.")); +#endif + } else { + bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + auto possible_inference_graphs = + details::TrySeparateToMultipleSingleDeviceGraphs(graph); + if (!possible_inference_graphs.empty()) { + VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase"; + auto *pg_exe = new details::ParallelSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, std::move(possible_inference_graphs)); + if (!has_drop_last_read_op) { + VLOG(5) << "Enable partial feed support in inference phase"; + pg_exe->EnablePartialFeedSupport(); + } + final_graphs = pg_exe->Graphs(); + member_->executor_.reset(pg_exe); + member_->inference_executor_ = pg_exe; + } else { + LOG_IF(WARNING, details::HasKeepLastReadOp(*graph)) + << "drop_last=False for DataLoader is not supported in training " + "network. It is automatically turned to drop_last=True."; + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + VLOG(3) << "use ThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph)); + } else { + VLOG(3) << "use FastThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph)); + } + final_graphs.emplace_back(graph); + } + } + return final_graphs; +} + +void ParallelExecutor::ResetOpHandleScopeMapOfGraphs( + const std::vector<ir::Graph *> &final_graphs, + const std::unordered_map<Scope *, Scope *> &scope_map) { + PADDLE_ENFORCE_GE( + final_graphs.size(), 1, + platform::errors::PreconditionNotMet( + "final_graphs should contain at least one graph, but received %d", + final_graphs.size())); + + PADDLE_ENFORCE_GT(scope_map.size(), 0, + platform::errors::PreconditionNotMet( + "scope_map should contain at least one " + "element, but received %d", + scope_map.size())); + for (auto *g : final_graphs) { + auto ops = ir::FilterByNodeWrapper<details::OpHandleBase>(*g); + for (auto *op : ops) { + op->SetLocalExecScopes(scope_map); + } + } +} + +void ParallelExecutor::SetReaderOpDeviceInfoOfGraphs( + const std::vector<ir::Graph *> &final_graphs) { + if (final_graphs.size() == 1) { + ir::SetReaderOpDeviceInfo(final_graphs[0], member_->places_.size()); + } else { + for (size_t i = 0; i < final_graphs.size(); ++i) { + ir::SetReaderOpDeviceInfo(final_graphs[i], member_->places_.size(), i); + } + } +} + const ir::Graph &ParallelExecutor::Graph() const { return member_->executor_->Graph(); } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 47de7dc48f4f2..d4d0b534b55f0 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -24,6 +24,7
@@ limitations under the License. */ #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_info.h" @@ -41,6 +42,7 @@ namespace framework { class ParallelExecutorPrivate; +using details::VariableInfo; using details::BuildStrategy; using details::ExecutionStrategy; namespace p = paddle::platform; @@ -93,6 +95,40 @@ class ParallelExecutor { const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) const; + void InitExecutorPrivateMemberInfo(const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy, + size_t device_count, + const ir::Graph &graph); + + void CreateLocalScopes(Scope *global_scope, + const std::vector<Scope *> &local_scopes, + bool create_new); + + std::unordered_map<Scope *, Scope *> CreateLocalExecScopes( + const std::vector<Scope *> &local_scopes, bool create_new); + + std::vector<ir::Graph *> CloneGraphToMultiDevices(ir::Graph *graph); + + void PrepareNCCLCommunicator(Scope *global_scope); + + std::vector<ir::Graph *> CompileGraphWithBuildStrategy( + ir::Graph *graph, std::vector<ir::Graph *> *graphs, + const std::string &loss_var_name); + + void CreateVariableInfos(std::vector<details::VariableInfo> *var_infos, + ir::Graph *graph); + + std::vector<ir::Graph *> CreateSSAGraphExecutor( + const ExecutionStrategy &exec_strategy, + std::vector<ir::Graph *> *async_graphs, ir::Graph *graph); + + void ResetOpHandleScopeMapOfGraphs( + const std::vector<ir::Graph *> &final_graphs, + const std::unordered_map<Scope *, Scope *> &scope_map); + + void SetReaderOpDeviceInfoOfGraphs( + const std::vector<ir::Graph *> &final_graphs); + ParallelExecutorPrivate *member_; std::vector<std::unique_ptr<ir::Graph>> async_graphs_; }; diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index 5968df548dfb0..3649e00e7c9d8 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(WITH_ASCEND_CL) #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/trainer.h" @@ -34,7 +35,11 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, ParseDumpConfig(trainer_desc); const auto& section_config = section_params.section_config(); int place_id = section_config.place_id(); +#if (defined PADDLE_WITH_NCCL) place_ = platform::CUDAPlace(place_id); +#elif (defined WITH_ASCEND_CL) + place_ = platform::NPUPlace(place_id); +#endif worker_ = DeviceWorkerFactory::CreateDeviceWorker( trainer_desc.device_worker_name()); auto this_worker = diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index e77932fa5f226..39bc3f040639b 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -19,10 +19,6 @@ limitations under the License.
*/ #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/fleet_wrapper.h" -#include "paddle/fluid/framework/fleet/heter_context.h" -#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" -#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/trainer.h" #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ (defined PADDLE_WITH_PSLIB) @@ -64,7 +60,6 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, pull_dense_worker_ = PullDenseWorker::GetInstance(); pull_dense_worker_->Initialize(trainer_desc); SetDebug(trainer_desc.debug()); - fleet_ptr_ = FleetWrapper::GetInstance(); trainer_desc_ = trainer_desc; workers_.resize(place_num); for (int i = 0; i < place_num; ++i) { diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index 2597901d91f36..d178c4e556ca5 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/framework/fleet/heter_wrapper.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index e740771e5ca9f..7860b69313e7b 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -9,7 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(WITH_ASCEND_CL) #include #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/executor_gc_helper.h" diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 4c30c40ad5837..7e48d0dc5f962 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -113,6 +113,21 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, TensorToStream(os, selected_rows.value(), dev_ctx); } +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + auto place = selected_rows.place(); + dev_ctx = pool.Get(place); + SerializeToStream(os, selected_rows, *dev_ctx); +} + +void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + dev_ctx = pool.Get(platform::CPUPlace()); + DeserializeFromStream(os, selected_rows, *dev_ctx); +} + void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx) { { diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 48353b43f56ca..e53e3d973c524 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -173,5 +173,9 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx); +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows); + +void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index d6882b25d2258..78fd1af09e294 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -822,6 +822,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, #else PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); +#endif + } else if (platform::is_npu_place(tensor.place())) { +#ifdef PADDLE_WITH_ASCEND_CL + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& npu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + BOOST_GET_CONST(platform::NPUPlace, tensor.place()), + reinterpret_cast(data), size_to_write, + npu_dev_ctx.stream()); + npu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace is not supported when not compiled with NPU")); #endif } else { os.write(static_cast(data_ptr), @@ -877,9 +900,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor, auto ctx = platform::CPUDeviceContext(); size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || - 
platform::is_xpu_place(dev_ctx.GetPlace())) { + platform::is_xpu_place(dev_ctx.GetPlace()) || + platform::is_npu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(shape)); framework::VisitDataType( @@ -888,13 +912,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); + if (platform::is_npu_place(dev_ctx.GetPlace())) { + dev_ctx.Wait(); + } #else if (platform::is_gpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "CUDAPlace is not supported when not compiled with CUDA")); - } else { + } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace is not supported when not compiled with NPU")); } #endif } else { @@ -935,9 +965,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor, auto ctx = platform::CPUDeviceContext(); size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || - platform::is_xpu_place(dev_ctx.GetPlace())) { + platform::is_xpu_place(dev_ctx.GetPlace()) || + platform::is_npu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(dims)); framework::VisitDataType( @@ -946,13 +977,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); + if (platform::is_npu_place(dev_ctx.GetPlace())) { + dev_ctx.Wait(); + } #else if (platform::is_gpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "CUDAPlace is not supported when not compiled with CUDA")); - } else { + } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace is not supported when not compiled with NPU")); } #endif } else { diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 868d920f13ca8..22c8e1c1665f1 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -135,6 +135,7 @@ void TensorFromArray(const T* src, const size_t& array_size, } #endif } + template void TensorFromVector(const std::vector& src, const platform::DeviceContext& ctx, Tensor* dst) { @@ -158,13 +159,58 @@ void TensorFromVector(const std::vector& src, } #endif #ifdef PADDLE_WITH_ASCEND_CL + // NOTE(zhiqiu): Becareful that aclrtMemcpyAsync is different from + // cudaMemcpyAsync. + // cudaMemcpyAsync is actually "sync" between cpu <-> gpu. + // aclrtMemcpyAsync is really "async" between cpu <-> npu. + // Since vector is on cpu, I think this function should be a "sync" operation, + // so pass nullptr as stream to memory::Copy(). 
else if (platform::is_npu_place(dst_place)) { // NOLINT + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size, nullptr); + } +#endif +} + +// The fully specialized function should be inline to avoid +// multi-definition. +template <> +inline void TensorFromVector(const std::vector& src, + const platform::DeviceContext& ctx, Tensor* dst) { + // vector has no data() member, use array instead. + // See details: + // https://stackoverflow.com/questions/46115669/why-does-stdvectorbool-have-no-data/46115714 + bool* array = new bool[src.size()]; + for (unsigned int i = 0; i < src.size(); i++) { + array[i] = static_cast(src[i]); + } + + auto dst_place = ctx.GetPlace(); + auto src_ptr = static_cast(array); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(bool); + + if (platform::is_cpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy( - BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place, + BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place, src_ptr, size, - reinterpret_cast(ctx).stream()); + reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(dst_place)) { // NOLINT + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size, nullptr); + } +#endif + delete[] array; } template @@ -179,6 +225,23 @@ void TensorFromVector(const std::vector& src, Tensor* dst) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } +template <> +inline void TensorFromVector(const std::vector& src, Tensor* dst) { + bool* array = new bool[src.size()]; + for (unsigned int i = 0; i < src.size(); i++) { + array[i] = static_cast(src[i]); + } + platform::CPUPlace dst_place = platform::CPUPlace(); + auto src_ptr = static_cast(array); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(bool); + + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + delete[] array; +} + template void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, std::vector* dst) { @@ -204,12 +267,50 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, #endif #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(src.place())) { // NOLINT + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr, + size, nullptr); + } +#endif +} + +template <> +inline void TensorToVector(const Tensor& src, + const platform::DeviceContext& ctx, + std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(bool); + + bool* array = new bool[src.numel()]; + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(array); + + if (platform::is_cpu_place(src.place())) { + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, + size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy( - dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()), + dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()), src_ptr, size, - 
reinterpret_cast(ctx).stream()); + reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(src.place())) { // NOLINT + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr, + size, nullptr); + } +#endif + for (unsigned int i = 0; i < src.numel(); i++) { + (*dst)[i] = static_cast(array[i]); + } + delete[] array; } template @@ -231,6 +332,32 @@ void TensorToVector(const Tensor& src, std::vector* dst) { BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size); } +template <> +inline void TensorToVector(const Tensor& src, std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(bool); + + bool* array = new bool[src.numel()]; + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(array); + + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(src.place()), true, + platform::errors::InvalidArgument( + "The input tensor should be CPU device, but actually it is in %s.", + src.place())); + + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size); + + for (unsigned int i = 0; i < src.numel(); i++) { + (*dst)[i] = static_cast(array[i]); + } + delete[] array; +} + std::ostream& operator<<(std::ostream& os, const Tensor& t); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index c32efd0a470be..8587ee8d1e919 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -242,6 +242,61 @@ TEST(TensorToVector, Tensor) { #endif } +TEST(TensorToVector, Tensor_bool) { + { + paddle::framework::Tensor src; + bool* src_ptr = + src.mutable_data({3, 3}, paddle::platform::CPUPlace()); + for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = static_cast(i % 2); + } + + paddle::platform::CPUPlace place; + std::vector dst; + paddle::framework::TensorToVector(src, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); + } + } +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = { + false, true, false, true, false, true, false, true, false, + }; + paddle::framework::Tensor gpu_tensor; + paddle::platform::CUDAPlace place; + paddle::platform::CUDADeviceContext gpu_ctx(place); + paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(gpu_tensor, gpu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + { + std::vector src_vec = { + false, true, false, true, false, true, false, true, false, + }; + paddle::framework::Tensor npu_tensor; + paddle::platform::NPUPlace place(0); + paddle::platform::NPUDeviceContext npu_ctx(place); + paddle::framework::TensorFromVector(src_vec, npu_ctx, &npu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(npu_tensor, npu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +} + TEST(TensorFromDLPack, Tensor) { { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index ca290a50b42fe..01aa07e618464 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -109,13 +109,22 @@ class MultiTrainer : public TrainerBase { virtual Scope* GetWorkerScope(int thread_id); virtual std::string GetDumpPath(int tid); + template + void 
MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor); +#ifdef PADDLE_WITH_HETERPS + + void MergeDenseParam(); +#endif + protected: int thread_num_; std::vector threads_; std::vector readers_; std::vector> workers_; std::vector need_merge_var_names_; - +#ifdef PADDLE_WITH_HETERPS + std::vector places_; +#endif int mpi_rank_; int mpi_size_; int dump_file_num_; @@ -313,7 +322,6 @@ class PSGPUTrainer : public TrainerBase { float scale_datanorm_; paddle::platform::Place place_; ProgramDesc program_; - std::shared_ptr fleet_ptr_; std::shared_ptr pull_dense_worker_; std::vector> workers_; std::vector places_; @@ -324,7 +332,8 @@ class PSGPUTrainer : public TrainerBase { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(WITH_ASCEND_CL) class PipelineTrainer : public TrainerBase { public: PipelineTrainer() {} diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index a2b5a98401e23..e43cccfe64816 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -45,6 +45,17 @@ using Attribute = boost::variant< using AttributeMap = std::unordered_map; +#ifdef PADDLE_WITH_ASCEND_CL +using NPUAttribute = + boost::variant, + std::vector, std::vector, bool, + std::vector, BlockDesc*, int64_t, + std::vector, std::vector, + std::vector, std::vector>>; + +using NPUAttributeMap = std::unordered_map; +#endif + using OpCreator = std::function; diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index fc754cbaf177c..473df85aa0421 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -36,6 +36,11 @@ #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include +#endif + #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" #endif @@ -50,6 +55,10 @@ class Communicator; class NCCLCommunicator; #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL +class Communicator; +class HCCLCommunicator; +#endif #if defined(PADDLE_WITH_XPU_BKCL) class BKCLCommunicator; @@ -162,6 +171,9 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< #endif operators::CudnnRNNCache, #endif +#if defined(PADDLE_WITH_ASCEND_CL) + HcclRootInfo, +#endif #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, platform::BKCLCommunicator, #endif diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index a24c0ac09c758..6bee3d44b2edd 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -4,7 +4,7 @@ cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry) add_subdirectory(jit) cc_library(amp SRCS amp_auto_cast.cc DEPS layer ) -cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp) +cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal) cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator) cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator) cc_library(imperative_profiler SRCS profiler.cc) diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 2a439a6f1ea81..d5350744e4c55 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -284,15 +284,15 @@ static std::shared_ptr> 
CallGradientHooks( for (const auto& pair : bwd_ins) { for (size_t i = 0; i < pair.second.size(); ++i) { auto& var = pair.second[i]; - if (var->HasHook()) { + if (var->HasVariableWrapperHook()) { if (tmp_ins_ptr == nullptr) { tmp_ins_ptr = std::make_shared>(bwd_ins); } - VLOG(3) << "Call " << var->GetHooks().size() << " hooks of " << op_type - << "'s input `" << pair.first << "`'s var `" << var->Name() - << "`."; + VLOG(3) << "Call " << var->GetVariableWrapperHooks().size() + << " hooks of " << op_type << "'s input `" << pair.first + << "`'s var `" << var->Name() << "`."; auto tmp_var = var; - for (const auto& hook_pair : var->GetHooks()) { + for (const auto& hook_pair : var->GetVariableWrapperHooks()) { tmp_var = (*hook_pair.second)(tmp_var); } (*tmp_ins_ptr)[pair.first][i] = tmp_var; diff --git a/paddle/fluid/imperative/dygraph_grad_maker.h b/paddle/fluid/imperative/dygraph_grad_maker.h index a367840472827..7fefc9ccc67b5 100644 --- a/paddle/fluid/imperative/dygraph_grad_maker.h +++ b/paddle/fluid/imperative/dygraph_grad_maker.h @@ -279,6 +279,8 @@ class TracedGradOp { void SetType(const std::string& type) { op_->SetType(type); } + const framework::OperatorBase& InnerOp() const { return op_->InnerOp(); } + void SetAttrMap(const framework::AttributeMap& attrs) { return op_->SetAttrMap(attrs); } diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 64f5a9e0cc877..43546cf99c69f 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -467,14 +467,14 @@ void GradientAccumulator::CallGradientHooks() { platform::errors::PreconditionNotMet("Leaf Tensor's inner var " "is not initialized when " "call gradient hook.")); - if (var_->HasHook()) { - VLOG(3) << "Call " << var_->GetHooks().size() + if (var_->HasVariableWrapperHook()) { + VLOG(3) << "Call " << var_->GetVariableWrapperHooks().size() << " hooks of leaf gradient accumulator's inner var `" << var_->Name() << "`."; auto tmp_var = inner_var_; VLOG(3) << "Input var " << var_->Name() << "'s hook size - " - << var_->GetHooks().size(); - for (const auto& hook_pair : var_->GetHooks()) { + << var_->GetVariableWrapperHooks().size(); + for (const auto& hook_pair : var_->GetVariableWrapperHooks()) { tmp_var = (*hook_pair.second)(tmp_var); } inner_var_ = tmp_var; @@ -495,10 +495,10 @@ void GradientAccumulator::CallReduceHooks() { "Only can call reduce hooks after the " "gradient accumulation is completed in " "current batch or across batchs.")); - if (var_->HasMutableHook()) { - for (const auto& hook : var_->GetMutableHooks()) { + if (var_->HasVoidHook()) { + for (const auto& hook : var_->GetVoidHooks()) { VLOG(3) << "call gradient accumulator backward hooks."; - (*hook)(var_); + (*hook)(); } } } diff --git a/paddle/fluid/imperative/hooks.h b/paddle/fluid/imperative/hooks.h index 4d59298aed51f..fa929b7c7a51c 100644 --- a/paddle/fluid/imperative/hooks.h +++ b/paddle/fluid/imperative/hooks.h @@ -23,32 +23,34 @@ namespace imperative { class VariableWrapper; -/** [ Const VariableWrapper Hook: Pre hook functor of OpBase ] +/** [ VariableWrapper Hook ] * - * @brief This hook functor is executed before the grad OpBase is executed, - * taking the input of the current grad OpBase as input, and - * executing python hooks (user-defined) or C++ hooks (developer-defined) - * to achieve the purpose of custom operations on the interior VarBase - * gradient. 
+ * @brief This hook functor is executed before the grad OpBase is executed or + * after gradient accumulation completed in current batch. + * 1. For interior var, VariableWrapper Hook take the input of the + * current grad OpBase as input. + * 2. For leaf var, VariableWrapper Hook take the inner_var_ of + * GradientAccumulator as input. * - * @note This hook functor will not change the input gradient VarBase. + * @note This hook functor will not change the input gradient VariableWrapper, + * but if you copy the input VariableWrapper and change the value of + * Variable in VariableWrapper, the value of input will also be changed, + * because they shared same PlaceHolder. * - * @note [Why need to be OpBase `PreHook`, why not `PostHook`?] + * @note [ Why need to be OpBase `PreHook`, why not `PostHook`? ] * - * 1. We expect If set OpBase post hook, when the op executed end, the + * We expect If set OpBase post hook, when the op executed end, the * op's output gradient may not be the final state, because it may need * other op's gradient output to accumulated to it. But before op can * be executed, the gradient output must have been accumulated to final * value. - * 2. We don’t want the hook to change its input Tensor value, so now - * we can't call all hooks in GradAccumulator. * - * @note [Why only can be used for interior VarBase?] + * @note [ Why Leaf gradient is special? ] * * Because the leaf VarBase's GradVarBase has no GradOpNode, so leaf * GradVarBase has no next OpBase to executed, so if need to deal with - * the leaf GradVarBase, cannot use this hook functor. For this case, we - * deal with by other inplace hook method. + * the leaf GradVarBase, we should call hooks after gradient accumulation + * completed. */ class VariableWrapperHook { public: @@ -57,34 +59,22 @@ class VariableWrapperHook { const std::shared_ptr& var) = 0; }; -/** [ Inplace VariableWrapper Hook: Post hook functor of GradAccumulator ] - * - * @brief This hook functor is the Hook that operates on the current - * gradientafter the GradientAccumulator has accumulated the gradient. - * Leaf GradVarBase has no next OpBase, if we want to register hook - * for it, we also need to wait until the leaf GradVarBase accumulation - * is completed, so we can add post hook to GradientAccumulator. - * - * @note This hook functor will change the grad VarBase value. - * - * @note Only allow leaf VarBase hold call this hook functor. 
- */ -class InplaceVariableWrapperHook { - public: - virtual ~InplaceVariableWrapperHook() = default; - virtual void operator()(VariableWrapper* var) = 0; -}; - -class LambdaInplaceVariableWrapperHook : public InplaceVariableWrapperHook { +class CppVariableWrapperHook : public VariableWrapperHook { public: - explicit LambdaInplaceVariableWrapperHook( - std::function&& fn) + explicit CppVariableWrapperHook( + std::function( + const std::shared_ptr&)>&& fn) : fn_(std::move(fn)) {} - void operator()(VariableWrapper* var) override { fn_(var); } + std::shared_ptr operator()( + const std::shared_ptr& var) override { + return fn_(var); + } private: - std::function fn_; + std::function( + const std::shared_ptr&)> + fn_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 062f04c6b7052..a4af3117d3e32 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -187,6 +187,7 @@ size_t VarBase::GradOpNum() const { } void VarBase::ClearGradient() { + VLOG(4) << "ClearGradient " << Name(); if (grad_var_) { if (grad_var_->Var().IsType()) { auto* grad_t = @@ -406,7 +407,7 @@ void OpBase::Run(const framework::OperatorBase& op, OpBaseRunImpl(op, ins, outs, attrs, place); } -static void ClearNoNeedBufferInputs(OpBase* op) { +void ClearNoNeedBufferInputs(OpBase* op) { auto& inferer = op->Info().NoNeedBufferVarsInferer(); if (!inferer) return; auto* ins = op->GetMutableInsMap(); diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index f87db415768a1..bbede47e36429 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -108,6 +108,10 @@ class VarBase { void ClearGradVarBase() { grad_var_ = nullptr; } + void SetGradVarBase(VarBase& grad_var) { + MutableGradVarBase()->CopyFrom(grad_var, true); + } + const std::shared_ptr& MutableGradVarBase() { if (grad_var_ == nullptr) { if (auto grad_var_wrapper = var_->GetGradVar()) { @@ -222,23 +226,25 @@ class VarBase { void BumpInplaceVersion(); /* Hook related method: now only used for GradVarBase */ - bool HasHook() const { return var_->HasHook(); } + bool HasVariableWrapperHook() const { return var_->HasVariableWrapperHook(); } - int64_t AddHook(std::shared_ptr&& hook) { - return var_->AddHook( + int64_t AddVariableWrapperHook(std::shared_ptr&& hook) { + return var_->AddVariableWrapperHook( std::forward>(hook)); } - bool RemoveHook(const int64_t& hook_id) { return var_->RemoveHook(hook_id); } + bool RemoveVariableWrapperHook(const int64_t& hook_id) { + return var_->RemoveVariableWrapperHook(hook_id); + } - const std::map>& GetHooks() - const { - return var_->GetHooks(); + const std::map>& + GetVariableWrapperHooks() const { + return var_->GetVariableWrapperHooks(); } - void AddMutableHook(std::shared_ptr&& hook) { - var_->AddMutableHook( - std::forward>(hook)); + void AddVoidHook(std::shared_ptr>&& hook) { + var_->AddVoidHook( + std::forward>>(hook)); } private: @@ -280,5 +286,7 @@ std::shared_ptr CreateGradOpNode( const platform::Place& place, const std::map& inplace_map); +void ClearNoNeedBufferInputs(OpBase* op); + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/py_layer_fwd.h b/paddle/fluid/imperative/py_layer_fwd.h new file mode 100644 index 0000000000000..bd132f2576fec --- /dev/null +++ b/paddle/fluid/imperative/py_layer_fwd.h @@ -0,0 +1,172 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/tracer.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/operators/py_layer_op.h" + +namespace paddle { +namespace imperative { + +namespace py = ::pybind11; + +bool RequiredGrad(const NameVarBaseMap& ins, const NameVarBaseMap& outs) { + for (const auto& name_pair : ins) { + for (const auto& var_base : name_pair.second) { + if (!var_base->OverridedStopGradient()) { + PassStopGradient(outs, var_base->OverridedStopGradient()); + return true; + } + } + } + return false; +} + +std::shared_ptr CreateGradOpNode( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, const framework::AttributeMap& attrs, + const platform::Place& place, + const std::map& inplace_map, + const std::shared_ptr& py_context) { + operators::PyLayerGradOpMaker maker( + type, ins, outs, attrs, inplace_map); + + maker.SetPyLayerContext(py_context); + auto grad_node = maker(); + if (grad_node && !grad_node->empty()) { + for (auto& grad_op : *grad_node) { + grad_op.SetId(OpBase::GenerateUniqueId()); + grad_op.SetPlace(place); + ClearNoNeedBufferInputs(&grad_op); + } + return grad_node; + } else { + return nullptr; + } +} + +py::object PyLayerApply(const platform::Place& place, const py::object& cls, + const py::args args, const py::kwargs kwargs) { + auto bk_function = cls.attr("_backward_function"); + auto context = bk_function(); + auto forward = cls.attr("forward"); + + auto result_forward = forward(context, *args, **kwargs); + std::shared_ptr py_layer_ctx = + std::make_shared(context.release().ptr()); + // make inputs to varbase + std::vector> input_vars; + // process args,`input_vars` only collect `imperative::VarBase` + if (!args.empty()) { + for (auto ptr = args.begin(); ptr != args.end(); ptr++) { + try { + if (Py_None != ptr->ptr()) { + auto a = ptr->cast>(); + input_vars.push_back(a); + } + } catch (py::cast_error& err) { + // Only collect Tensor type in 'args' and pass them to backward. Ignore + // other types of input temporarily. + } + } + } + // process kwargs, only collect `imperative::VarBase` + if (!kwargs.empty()) { + for (auto ptr = kwargs.begin(); ptr != kwargs.end(); ptr++) { + try { + if (Py_None != ptr->second.ptr()) { + auto a = ptr->second.cast>(); + input_vars.push_back(a); + } + } catch (py::cast_error&) { + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. 
+ } + } + } + NameVarBaseMap ins = {{"X", input_vars}}; + + std::vector> output_vars; + if (PyTuple_Check(result_forward.ptr()) || + PyList_Check(result_forward.ptr())) { + auto tuple_result = result_forward.cast(); + for (size_t i = 0; i < tuple_result.size(); i++) { + if (Py_None != tuple_result[i].ptr()) { + try { + auto temp_out = + tuple_result[i].cast>(); + output_vars.push_back(temp_out); + } catch (py::cast_error&) { + PADDLE_THROW(platform::errors::Unimplemented( + "The output of `PyLayer.forward` should be `Tensor`.")); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "The output of `PyLayer.forward` can not be `None`.")); + } + } + } else { + if (Py_None != result_forward.ptr()) { + try { + auto temp_out = + result_forward.cast>(); + output_vars.push_back(temp_out); + } catch (py::cast_error&) { + PADDLE_THROW(platform::errors::Unimplemented( + "The output of `PyLayer.forward` should be `Tensor`.")); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "The output of `PyLayer.forward` can not be `None`.")); + } + } + + NameVarBaseMap outs = {{"Out", output_vars}}; + + if (RequiredGrad(ins, outs)) { + std::map inplace_map{}; + bool if_inplace = false; + for (auto temp_ins : input_vars) { + if (if_inplace) { + break; + } + for (auto temp_outs : output_vars) { + if (temp_ins->Name() == temp_outs->Name()) { + if_inplace = true; + break; + } + } + } + if (if_inplace) { + inplace_map["X"] = "Out"; + } + + CreateGradOpNode("py_layer", ins, outs, {{}}, place, inplace_map, + py_layer_ctx); + } else { + VLOG(3) << "No Grad to track for Op: py_layer_op"; + } + + return result_forward; +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 5422b7ce9c855..a92704ce447dc 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -310,9 +310,8 @@ Reducer::Reducer(const std::vector> &vars, for (size_t global_var_index = 0; global_var_index < vars_.size(); ++global_var_index) { auto var = vars_[global_var_index]; - var->GradVarBase()->AddMutableHook( - std::make_shared([=]( - VariableWrapper *grad) { this->AddDistHook(global_var_index); })); + var->GradVarBase()->AddVoidHook(std::make_shared>( + [=]() { this->AddDistHook(global_var_index); })); var_index_map_[var->GradVarBase()->SharedVar().get()] = global_var_index; } diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 8c907b9890652..5c4e1538cf053 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -37,6 +37,30 @@ namespace imperative { using vb_vector = std::vector>; using var_pair = std::pair; +std::shared_ptr DoubleHook( + const std::shared_ptr& var) { + // 1. create out var + auto out_var = std::make_shared(var->Name()); + out_var->SetType(var->Type()); + out_var->SetDataType(var->DataType()); + out_var->SetForwardDataType(var->ForwardDataType()); + out_var->InnerSetOverridedStopGradient(var->InnerOverridedStopGradient()); + + // 2. get input and output var's tensor + auto* out_tensor = out_var->MutableVar()->GetMutable(); + auto& tensor = var->Var().Get(); + out_tensor->Resize(tensor.dims()); + + // 3. double calc + auto* data = tensor.data(); + auto* out_data = out_tensor->mutable_data(platform::CPUPlace()); + for (int64_t i = 0; i < out_tensor->numel(); ++i) { + out_data[i] = data[i] * 2.0; + } + + return out_var; +} + TEST(TestHooks, TestGradVarLeafBackwardHook) { // 1. 
prepare Tracer tracer; @@ -73,16 +97,14 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { framework::AttributeMap mul_attr_map; mul_attr_map["use_mkldnn"] = false; - // add GradAccumulatorPostHook - x->GradVarBase()->AddMutableHook( - std::make_shared( - [=](VariableWrapper* grad) { - auto* grad_tensor = - grad->MutableVar()->GetMutable(); - for (int i = 0; i < grad_tensor->numel(); ++i) { - grad_tensor->mutable_data(place)[i] *= 2.0; - } - })); + // add VariableWrapper hook + x->GradVarBase()->AddVariableWrapperHook( + std::make_shared(DoubleHook)); + + // add Void hook + int64_t hook_value = 0; + x->GradVarBase()->AddVoidHook( + std::make_shared>([&]() { hook_value = 10; })); // 2. forward tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); @@ -98,12 +120,15 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { engine.Init(tensors, grad_tensors); engine.Execute(); + // verify VariableWrapper hook result framework::LoDTensor x_grad; framework::TensorCopySync(x->GradVar().Get(), place, &x_grad); for (int i = 0; i < x_grad.numel(); ++i) { ASSERT_EQ(x_grad.data()[i], 8.0); } + // verify Void hook result + ASSERT_EQ(hook_value, 10); framework::LoDTensor y_grad; framework::TensorCopySync(y->GradVar().Get(), place, @@ -152,16 +177,14 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { memory::Copy(place, mutable_z, place, src_data.data(), sizeof(float) * src_data.size()); - // add ReduceBackwardHook - x->GradVarBase()->AddMutableHook( - std::make_shared( - [=](VariableWrapper* grad) { - auto* grad_tensor = - grad->MutableVar()->GetMutable(); - for (int i = 0; i < grad_tensor->numel(); ++i) { - grad_tensor->mutable_data(place)[i] *= 2.0; - } - })); + // add VariableWrapper hook + x->GradVarBase()->AddVariableWrapperHook( + std::make_shared(DoubleHook)); + + // add Void hook + int64_t hook_value = 0; + x->GradVarBase()->AddVoidHook( + std::make_shared>([&]() { hook_value = 100; })); // 2. 
forward var_pair x_pair = var_pair("X", vb_vector(1, x)); @@ -199,12 +222,15 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { engine.Init(tensors, grad_tensors); engine.Execute(); + // verify VariableWrapper hook result framework::LoDTensor x_grad; framework::TensorCopySync(x->GradVar().Get(), place, &x_grad); for (int i = 0; i < x_grad.numel(); ++i) { ASSERT_EQ(x_grad.data()[i], 16.0); } + // verify Void hook result + ASSERT_EQ(hook_value, 100); framework::LoDTensor y_grad; framework::TensorCopySync(y->GradVar().Get(), place, diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 608cc407d5b77..742514c0910a2 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/op_base.h" +#include "paddle/fluid/platform/denormal.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/string_helper.h" @@ -38,7 +39,7 @@ void SetCurrentTracer(const std::shared_ptr& tracer) { VLOG(6) << "Set current tracer: " << g_current_tracer; } -static void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) { +void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) { for (const auto& pair : outs) { for (const auto& var : pair.second) { // NOTE(zhiqiu): this happends when None output are passed from python @@ -134,6 +135,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, const platform::Place& place, bool trace_backward, const std::map& inplace_map) { platform::RecordEvent op_type_record_event(type); + platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { // if both lists are empty all ops are enabled (default for diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index b10d1b2d0b49d..8f50550878262 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -130,5 +130,7 @@ void IncreaseVarbaseReferenceCountUntilCopyComplete( const std::shared_ptr& var, const platform::Place& place); +void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad); + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index 7d287c9829104..5fa8b89a396d9 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -38,6 +38,9 @@ class VariableWrapper { explicit VariableWrapper(const std::string& name) : name_(name) {} + VariableWrapper(const std::string& name, const framework::Variable& variable) + : var_(variable), name_(name) {} + ~VariableWrapper() { VLOG(10) << "Destruct VariableWrapper: " << Name(); } const framework::Variable& Var() const { return var_; } @@ -220,35 +223,35 @@ class VariableWrapper { } /* Hook related methods */ - bool HasHook() const { return !hooks_.empty(); } - - bool HasMutableHook() const { return !mutable_hooks_.empty(); } + bool HasVariableWrapperHook() const { return !var_hooks_.empty(); } - int64_t AddHook(std::shared_ptr&& hook) { - hooks_.emplace(next_hook_id_, std::move(hook)); + int64_t AddVariableWrapperHook(std::shared_ptr&& hook) { + var_hooks_.emplace(next_hook_id_, std::move(hook)); return next_hook_id_++; } - bool RemoveHook(const int64_t& hook_id) { - auto remove_cnt = hooks_.erase(hook_id); + bool RemoveVariableWrapperHook(const int64_t& hook_id) { + auto 
remove_cnt = var_hooks_.erase(hook_id); if (remove_cnt == 0) { return false; } return true; } - const std::map>& GetHooks() - const { - return hooks_; + const std::map>& + GetVariableWrapperHooks() const { + return var_hooks_; } - void AddMutableHook(std::shared_ptr&& hook) { - mutable_hooks_.emplace_back(std::move(hook)); + bool HasVoidHook() const { return !void_hooks_.empty(); } + + void AddVoidHook(std::shared_ptr>&& hook) { + void_hooks_.emplace_back(std::move(hook)); } - const std::vector>& - GetMutableHooks() const { - return mutable_hooks_; + const std::vector>>& GetVoidHooks() + const { + return void_hooks_; } private: @@ -319,14 +322,19 @@ class VariableWrapper { // isn't need bool is_empty_{false}; - // NOTE(chenweihang): only grad var can hold hooks now + // NOTE(chenweihang): only grad var will hold hooks now int64_t next_hook_id_{0}; - // Hooks used to register hook for grad var, support adding and removing, + // [ Hooks with VariableWrapper as input and output ] + // NOTE: Now registered for grad var, support adding and removing, // key is the accumulated int64_t value - std::map> hooks_; - // Hooks executed after the execution of the entire backward process is over, - // currently only supported for reducing in distributed training - std::vector> mutable_hooks_; + // NOTE: Var hook need to support removing, so need hook id + std::map> var_hooks_; + // [ Hooks without input and output ] + // NOTE: Now registered after the execution of the entire backward + // process is over, currently only used for reducing in distributed + // training + // NOTE: Now no need to support remove void hook + std::vector>> void_hooks_; }; } // namespace imperative diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 93fd85f13cbf0..c002c7a10cb7b 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -33,7 +33,7 @@ if (WITH_LITE) add_subdirectory(lite) endif() -# fluid_modules exclude API-interface of inference/api and inference/capi +# fluid_modules exclude API-interface of inference/api and inference/capi_exp get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) # Adapt to custom op mechanism: Include the header files related to the data type @@ -61,7 +61,7 @@ if(NOT APPLE) endif() # C inference API -add_subdirectory(capi) +add_subdirectory(capi_exp) if(WITH_TESTING AND WITH_INFERENCE_API_TEST) add_subdirectory(tests/api) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index bd27b1f5f3447..255c6ca75dfd7 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -213,6 +213,11 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool); DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool); + DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool); + DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int); + DECL_ARGUMENT_FIELD(dlnne_max_batch_size, DlnneMaxBatchSize, int); + DECL_ARGUMENT_FIELD(dlnne_workspace_size, DlnneWorkspaceSize, int); + DECL_ARGUMENT_FIELD(lite_passes_filter, LitePassesFilter, std::vector); DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector); @@ -222,6 +227,11 @@ struct Argument { DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool); DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int); + DECL_ARGUMENT_FIELD(xpu_locked, XpuLocked, bool); + DECL_ARGUMENT_FIELD(xpu_autotune, XpuAutotune, bool); + 
DECL_ARGUMENT_FIELD(xpu_autotune_file, XpuAutotuneFile, std::string); + DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string); + DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool); // Memory optimized related. DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index a4e263e2f464c..8407f98e6dfd9 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -166,6 +166,11 @@ void IRPassManager::CreatePasses(Argument *argument, // run fp16. pass->Set("disable_trt_plugin_fp16", new bool(argument->disable_trt_plugin_fp16())); + } else if (pass_name == "dlnne_subgraph_pass") { + pass->Set("min_subgraph_size", + new int(argument->dlnne_min_subgraph_size())); + pass->Set("program", + new framework::ProgramDesc *(&argument->main_program())); } if (pass_name == "lite_subgraph_pass") { bool enable_int8 = @@ -183,6 +188,12 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->xpu_l3_workspace_size())); pass->Set("cpu_math_library_num_threads", new int(argument->cpu_math_library_num_threads())); + pass->Set("locked", new bool(argument->xpu_locked())); + pass->Set("autotune", new bool(argument->xpu_autotune())); + pass->Set("autotune_file", + new std::string(argument->xpu_autotune_file())); + pass->Set("precision", new std::string(argument->xpu_precision())); + pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen())); } disable_logs_ = argument->disable_logs(); if (pass_name == "fc_fuse_pass") { diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index e35178428cc7b..330f7a9984734 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -20,3 +20,15 @@ if (WITH_LITE) set(INFER_IR_PASSES ${INFER_IR_PASSES} lite_subgraph_pass CACHE INTERNAL "") cc_test(lite_subgraph_pass_tester SRCS lite_subgraph_pass_tester.cc DEPS lite_subgraph_pass gtest glog) endif() + +MESSAGE("WITH_DLNNE:${WITH_DLNNE}") +if(WITH_DLNNE) + cc_library(dlnne_subgraph_pass SRCS dlnne_subgraph_pass.cc DEPS ${analysis_deps} subgraph_util) + set(analysis_deps ${analysis_deps} + subgraph_util dlnne_subgraph_pass + CACHE INTERNAL "") + + set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp) + file(APPEND ${pass_file} "USE_PASS(dlnne_subgraph_pass);\n") + set(INFER_IR_PASSES ${INFER_IR_PASSES} dlnne_subgraph_pass CACHE INTERNAL "") +endif() diff --git a/paddle/fluid/operators/distributed/large_scale_kv.cc b/paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h similarity index 64% rename from paddle/fluid/operators/distributed/large_scale_kv.cc rename to paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h index d2673ed6ffb36..ae977c1403a87 100644 --- a/paddle/fluid/operators/distributed/large_scale_kv.cc +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,16 +11,11 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. - -#include "paddle/fluid/operators/distributed/large_scale_kv.h" +#pragma once namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag LargeScaleKV::init_flag_; -std::shared_ptr LargeScaleKV::scale_kv_(nullptr); +namespace inference { -} // namespace distributed -} // namespace operators +int RegisterPyFunc(const std::string& name, void* pfn); +} // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc new file mode 100644 index 0000000000000..8f789139af9bf --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc @@ -0,0 +1,351 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include + +#include +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/subgraph_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h" +#include "paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace inference { + +int (*PyConvertGraph)(const char *graph_name); + +int RegisterPyFunc(const std::string &name, void *pfn) { + if (name.compare("convert_graph") == 0) { + PyConvertGraph = reinterpret_cast(pfn); + } + + return 0; +} +int ConvertGraph(std::string graph_name) { + LOG(INFO) << "starting doing convert_graph"; + + PyConvertGraph(graph_name.c_str()); + + return 0; +} + +namespace analysis { + +using framework::ir::Node; + +void analysis::DlnneSubgraphPass::ApplyImpl(framework::ir::Graph *graph) const { + static std::unordered_set teller_set{ + "mul", "matmul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", + "hard_swish", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", + "elementwise_add", "elementwise_mul", "dropout", "prelu", + "conv2d_transpose", "leaky_relu", + // "fc", + "shuffle_channel", "swish", "split", + // "instance_norm", + "gelu", + // "layer_norm", + // "scale", + // "stack", + "relu6", "reshape2", "transpose2", "concat", "slice", + }; + + framework::ir::FusePassBase::Init("dlnne_subgraph_pass", graph); + + auto teller = [&](const framework::ir::Node *node) { + if (!node->IsOp() || !node->Op()) return false; + return teller_set.find(node->Op()->Type()) != teller_set.end(); + }; + + framework::ir::SubGraphFuser fuser( + graph, teller, Get("min_subgraph_size") /*min subgraph size*/, + "dlnne_engine"); + fuser(); + + std::vector graph_param_names = + ExtractParameters(graph->Nodes()); + // those parameter already exist in dlnne, and should not have another copy in + // fluid. 
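RegisterPyFunc above lets a Python-side converter be injected as a plain function pointer through void*, and ConvertGraph later calls it by name. A self-contained sketch of the same registration pattern, with illustrative names:

#include <iostream>
#include <string>

// The callback is handed in as void*, cast back to its real signature, and
// stored in a file-scope pointer that the pass invokes after dumping a subgraph.
using ConvertFn = int (*)(const char*);
static ConvertFn g_convert_graph = nullptr;

int RegisterCallback(const std::string& name, void* pfn) {
  if (name == "convert_graph") {
    g_convert_graph = reinterpret_cast<ConvertFn>(pfn);
  }
  return 0;
}

int DemoConvert(const char* graph_name) {
  std::cout << "converting " << graph_name << "\n";
  return 0;
}

int main() {
  RegisterCallback("convert_graph", reinterpret_cast<void*>(&DemoConvert));
  if (g_convert_graph != nullptr) {
    g_convert_graph("engine_key_123");  // invoked by name, like ConvertGraph(engine_key)
  }
  return 0;
}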
+ std::vector repetitive_params; + + for (auto *node : graph->Nodes()) { + if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) { + CreateDlnneOp(node, graph, graph_param_names, &repetitive_params); + + std::unordered_set nodes2remove( + framework::ir::Agent(node).subgraph()->begin(), + framework::ir::Agent(node).subgraph()->end()); + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); + } + } + + std::unordered_set nodes2remove; + for (auto *node : graph->Nodes()) { + if (node->IsOp() && framework::ir::Agent(node).deleted()) { + nodes2remove.insert(node); + } + } + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); +} + +std::string GenerateEngineKey(const std::set &engine_inputs, + const std::set &engine_outputs, + const std::string &predictor_id) { + std::string engine_hash_key = ""; + for (auto name : engine_inputs) { + engine_hash_key += name; + } + for (auto name : engine_outputs) { + engine_hash_key += name; + } + engine_hash_key += predictor_id; + auto engine_key = std::to_string(std::hash()(engine_hash_key)); + return engine_key; +} +std::string replace_name(std::string name, const char *raw, + const char *new_char) { + std::string r_name = name; + int pos = r_name.find(raw); + while (pos >= 0) { + r_name = r_name.replace(pos, 1, new_char); + pos = r_name.find(raw); + } + return r_name; +} + +void DlnneSubgraphPass::CreateDlnneOp( + framework::ir::Node *node, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const { + auto *op_desc = node->Op(); + auto &subgraph = *framework::ir::Agent(node).subgraph(); + PADDLE_ENFORCE_EQ(subgraph.empty(), false, + platform::errors::PreconditionNotMet( + "The subgraph should not be empty.")); + + // A fake block desc. + framework::proto::BlockDesc block_proto; + framework::BlockDesc block_desc(nullptr, &block_proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + LOG(INFO) << "--- detect a sub-graph with " << subgraph.size() << " nodes"; + // for debug + framework::ProgramDesc tmp_dump_program_desc; + auto *tmp_dump_main_block = tmp_dump_program_desc.MutableBlock(0); + + std::unordered_map name_var_desc; + std::set name_var_input_nodes; + std::set name_var_output_nodes; + std::set name_ops; + + for (auto *node : subgraph) { + auto *op = block_desc.AppendOp(); + *op->Proto() = *node->Op()->Proto(); + + // debug + { + name_ops.insert(node->Name()); + auto *tmp_dump_new_block_op = tmp_dump_main_block->AppendOp(); + + framework::OpDesc op_desc; + op_desc.CopyFrom(*node->Op()); + + for (auto argument_name : op_desc.InputArgumentNames()) { + if (std::count(graph_params.begin(), graph_params.end(), + argument_name) > 0) { + op_desc.Rename(argument_name, replace_name(argument_name, "/", ".")); + } + } + for (auto argument_name : op_desc.OutputArgumentNames()) { + if (std::count(graph_params.begin(), graph_params.end(), + argument_name) > 0) { + op_desc.Rename(argument_name, replace_name(argument_name, "/", ".")); + } + } + *tmp_dump_new_block_op->Proto() = *op_desc.Proto(); + + for (auto *x : node->inputs) { + if (x->IsVar()) { + name_var_desc[x->Name()] = x->Var(); + } + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) == + 0) + name_var_input_nodes.insert(x->Name()); + } + + for (auto *x : node->outputs) { + if (x->IsVar()) { + name_var_desc[x->Name()] = x->Var(); + } + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) == + 0) + name_var_output_nodes.insert(x->Name()); + } + } + } + std::set valid_input_names; 
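replace_name above rewrites '/' to '.' in parameter names such as "conv1/weights" so they are usable as file names when the subgraph is dumped. An equivalent standalone helper, sketched with std::string::npos handling:

#include <cassert>
#include <string>

// Replaces every occurrence of one character with another, as replace_name
// does for '/' -> '.'.
std::string ReplaceAll(std::string name, char raw, char replacement) {
  for (std::size_t pos = name.find(raw); pos != std::string::npos;
       pos = name.find(raw, pos + 1)) {
    name[pos] = replacement;
  }
  return name;
}

int main() {
  assert(ReplaceAll("conv1/weights", '/', '.') == "conv1.weights");
  return 0;
}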
+ std::set valid_output_names; + for (auto name : name_var_output_nodes) { + if (name_var_input_nodes.find(name) == name_var_input_nodes.end()) { + valid_output_names.insert(name); + } + } + + for (auto name : name_var_input_nodes) { + if (name_var_output_nodes.find(name) == name_var_output_nodes.end()) { + valid_input_names.insert(name); + } + } + + // Then, we will use the input_names_with_id and output_names_with_id to + // generate the engine key. + // So, We use set instead of unordered_set here to ensure that the engine key + // is unique. + std::set input_names; + std::set input_names_with_id; + std::vector params; + // if we delete fluid copy of params shared by more than 1 ops, there will be + // problem, so we filter them out. + + // The node->inputs contains input tensors and parameters. + for (auto *x : node->inputs) { + input_names.insert(x->Name()); + input_names_with_id.insert(x->Name() + std::to_string(x->id())); + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { + params.push_back(x->Name()); + } + } + + std::set output_names; + std::set output_names_with_id; + std::vector origin_output_dims; + for (auto *x : node->outputs) { + origin_output_dims.push_back(x->Var()->GetShape().size()); + output_names.insert(x->Name()); + output_names_with_id.insert(x->Name() + std::to_string(x->id())); + } + + std::unordered_map output_name_map; + std::unordered_map graph_var_map; + + for (framework::ir::Node *node : graph->Nodes()) { + if (node->IsVar() && node->Var()) { + graph_var_map[node->Name()] = node; + } + } + + // Set attrs + op_desc->SetType("dlnne_engine"); + op_desc->SetInput("Xs", std::vector(valid_input_names.begin(), + valid_input_names.end())); + + op_desc->SetOutput("Ys", std::vector(valid_output_names.begin(), + valid_output_names.end())); + + op_desc->SetAttr("parameters", params); + auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, + std::to_string(0)); + op_desc->SetAttr("engine_key", engine_key); + auto *scope = param_scope(); + + { + std::set input_names; + + for (auto name : name_var_input_nodes) { + if (name_var_output_nodes.find(name) == name_var_output_nodes.end()) { + input_names.insert(name); + } + } + + // add feed to subgraph: + int input_idx = 0; + for (auto input_name : input_names) { + auto *feed0 = tmp_dump_main_block->AppendOp(); + feed0->SetType("feed"); + feed0->SetInput("X", {"feed"}); + feed0->SetOutput("Out", {input_name}); + feed0->SetAttr("col", input_idx); + input_idx++; + } + // add fetch to subgraph: + int output_idx = 0; + for (auto output_name : valid_output_names) { + auto *fetch0 = tmp_dump_main_block->AppendOp(); + fetch0->SetType("fetch"); + fetch0->SetInput("X", {output_name}); + fetch0->SetOutput("Out", {"out"}); + fetch0->SetAttr("col", output_idx); + output_idx++; + } + + mkdir("./dump", 0777); + std::string dir_name = "./dump/" + engine_key; + mkdir(dir_name.c_str(), 0777); + ofstream m_stream; + m_stream.open(dir_name + "/__model__", ios::out); + + VLOG(4) << "name_var_desc size:" << name_var_desc.size(); + + for (auto &kv : name_var_desc) { + auto *new_add_var = tmp_dump_main_block->Proto()->add_vars(); + *new_add_var = *kv.second->Proto(); + auto *variable_tmp = scope->FindVar(kv.first); + if (variable_tmp != nullptr) { + *new_add_var->mutable_name() = replace_name(kv.first, "/", "."); + new_add_var->set_persistable(true); + } else { + new_add_var->set_persistable(false); + } + } + + for (auto param_name : params) { + auto *var = scope->FindVar(param_name); + if (var != 
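The loops above classify the subgraph boundary variables: an engine input is consumed inside the subgraph but never produced by it, and an engine output is produced inside but never consumed inside. A small worked example with made-up variable names:

#include <iostream>
#include <set>
#include <string>

int main() {
  std::set<std::string> consumed{"x", "conv1.out"};          // like name_var_input_nodes
  std::set<std::string> produced{"conv1.out", "relu1.out"};  // like name_var_output_nodes

  for (const auto& name : consumed) {
    if (produced.count(name) == 0) std::cout << "engine input:  " << name << "\n";
  }
  for (const auto& name : produced) {
    if (consumed.count(name) == 0) std::cout << "engine output: " << name << "\n";
  }
  // Prints x as the input and relu1.out as the output; conv1.out stays internal.
  return 0;
}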
nullptr) { + auto *var_t = var->GetMutable(); + ofstream p_stream; + p_stream.open(dir_name + "/" + replace_name(param_name, "/", "."), + ios::out); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(var_t->place()); + framework::SerializeToStream(p_stream, *var_t, dev_ctx); + p_stream.close(); + } + } + + std::string model; + + tmp_dump_program_desc.Proto()->SerializeToString(&model); + m_stream << model; + m_stream.close(); + + op_desc->SetBlockAttr("sub_block", tmp_dump_main_block); + op_desc->SetAttr("subgraph", model); + op_desc->Flush(); + + ConvertGraph(engine_key); + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle + +REGISTER_PASS(dlnne_subgraph_pass, + paddle::inference::analysis::DlnneSubgraphPass); diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h new file mode 100644 index 0000000000000..5a1d2506fdb09 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h @@ -0,0 +1,55 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { + +int ConvertGraph(std::string graph_name); + +namespace analysis { + +class DlnneSubgraphPass : public framework::ir::FusePassBase { + public: + void ApplyImpl(framework::ir::Graph *graph) const override; + + private: + void CleanIntermediateOutputs(framework::ir::Node *node); + void CreateDlnneOp(framework::ir::Node *x, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index c697914904b3e..b8cac8992f4ee 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -245,6 +245,11 @@ void LiteSubgraphPass::SetUpEngine( bool use_xpu = Get("use_xpu"); int xpu_l3_workspace_size = Get("xpu_l3_workspace_size"); int cpu_math_library_num_threads = Get("cpu_math_library_num_threads"); + bool locked = Get("locked"); + bool autotune = Get("autotune"); + std::string autotune_file = Get("autotune_file"); + std::string precision = Get("precision"); + bool adaptive_seqlen = Get("adaptive_seqlen"); lite_api::TargetType target_type; if (use_gpu) 
{ @@ -282,6 +287,11 @@ void LiteSubgraphPass::SetUpEngine( }; config.cpu_math_library_num_threads = cpu_math_library_num_threads; config.xpu_l3_workspace_size = xpu_l3_workspace_size; + config.locked = locked; + config.autotune = autotune; + config.autotune_file = autotune_file; + config.precision = precision; + config.adaptive_seqlen = adaptive_seqlen; if (dump_model) { lite::StrToBinaryFile("./model.bin", config.model); lite::StrToBinaryFile("./param.bin", config.param); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 60de4234b41a8..f57f07883dcd7 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" namespace paddle { @@ -321,11 +322,20 @@ void TensorRtSubgraphPass::CreateTensorRTOp( opt_input_shape = {}; } - if (min_input_shape.size() > 0 && TRT_VERSION > 6000) { + auto to_major_version = [&](int full_version) -> float { + return (full_version / 100) / 10.0; + }; + const float compile_time_trt_version = to_major_version(TRT_VERSION); + const float run_time_trt_version = + to_major_version(tensorrt::GetInferLibVersion()); + if (compile_time_trt_version != run_time_trt_version) { LOG_FIRST_N(WARNING, 1) - << "The Paddle lib links the " << TRT_VERSION << " version TensorRT, " - << "make sure the runtime TensorRT you are using is no less than this " - "version, otherwise, there might be Segfault!"; + << "The Paddle Inference library is compiled with " + << compile_time_trt_version << " version TensorRT, " + << "but the runtime TensorRT you are using is " << run_time_trt_version + << " version. " + "This might cause serious compatibility issues. We strongly " + "recommend using the same TRT version at runtime."; } // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will not diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 0622fb27d9e38..853c1ac1da874 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -26,6 +26,7 @@ namespace paddle { struct MkldnnQuantizerConfig; extern const std::vector kTRTSubgraphPasses; +extern const std::vector kDlnneSubgraphPasses; extern const std::vector kLiteSubgraphPasses; PassStrategy *AnalysisConfig::pass_builder() const { @@ -95,9 +96,17 @@ void AnalysisConfig::DisableFCPadding() { Update(); } -void AnalysisConfig::EnableXpu(int l3_workspace_size) { +void AnalysisConfig::EnableXpu(int l3_workspace_size, bool locked, + bool autotune, const std::string &autotune_file, + const std::string &precision, + bool adaptive_seqlen) { use_xpu_ = true; xpu_l3_workspace_size_ = l3_workspace_size; + xpu_locked_ = locked; + xpu_autotune_ = autotune; + xpu_autotune_file_ = autotune_file; + xpu_precision_ = precision; + xpu_adaptive_seqlen_ = adaptive_seqlen; Update(); } @@ -134,6 +143,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(trt_use_static_engine_); CP_MEMBER(trt_use_calib_mode_); CP_MEMBER(trt_use_oss_); + // Dlnne related + CP_MEMBER(use_dlnne_); + CP_MEMBER(dlnne_min_subgraph_size_); // MKLDNN related. 
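The TensorRT check above reduces the encoded version number to major.minor before comparing the compile-time and runtime libraries; assuming Paddle encodes the version roughly as major*1000 + minor*100 + patch*10, 7130 maps to 7.1. A sketch of that conversion with illustrative version numbers:

#include <iostream>

float ToMajorVersion(int full_version) { return (full_version / 100) / 10.0f; }

int main() {
  const int compile_time = 7130;  // e.g. linked against TensorRT 7.1.x
  const int run_time = 6050;      // e.g. runtime libnvinfer reports 6.0.x
  if (ToMajorVersion(compile_time) != ToMajorVersion(run_time)) {
    std::cout << "compile-time TRT " << ToMajorVersion(compile_time)
              << " vs runtime TRT " << ToMajorVersion(run_time)
              << ": expect compatibility issues\n";
  }
  return 0;
}

Comparing the two floats directly is fine here because both sides are produced by the same integer arithmetic, so equal major.minor values compare exactly equal.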
CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -157,6 +169,11 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_xpu_); CP_MEMBER(xpu_l3_workspace_size_); + CP_MEMBER(xpu_locked_); + CP_MEMBER(xpu_autotune_); + CP_MEMBER(xpu_autotune_file_); + CP_MEMBER(xpu_precision_); + CP_MEMBER(xpu_adaptive_seqlen_); // profile related. CP_MEMBER(with_profile_); @@ -211,6 +228,21 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { pass_builder_->DeletePass(ps); } } + if (use_dlnne_) { + auto all_passes = kDlnneSubgraphPasses; + auto other_passes = other.pass_builder()->AllPasses(); + // We should sort them, because the user may call the SwitchIrDebug + // interface, which will change the pass. + std::sort(all_passes.begin(), all_passes.end()); + std::sort(other_passes.begin(), other_passes.end()); + std::vector deleted_passes; + std::set_difference(all_passes.begin(), all_passes.end(), + other_passes.begin(), other_passes.end(), + std::inserter(deleted_passes, deleted_passes.begin())); + for (auto ps : deleted_passes) { + pass_builder_->DeletePass(ps); + } + } } void AnalysisConfig::EnableCUDNN() { @@ -309,6 +341,12 @@ void AnalysisConfig::EnableTensorRtEngine( #endif } +void AnalysisConfig::EnableDlnne(int min_subgraph_size) { + use_dlnne_ = true; + dlnne_min_subgraph_size_ = min_subgraph_size; + Update(); +} + void AnalysisConfig::SetTRTDynamicShapeInfo( std::map> min_input_shape, std::map> max_input_shape, @@ -383,6 +421,14 @@ void AnalysisConfig::Update() { pass_builder()->AppendPass(pass); } } + LOG(INFO) << "use_dlnne_:" << use_dlnne_ << std::endl; + if (use_dlnne_) { + pass_builder()->ClearPasses(); + for (const auto &pass : kDlnneSubgraphPasses) { + pass_builder()->AppendPass(pass); + } + } + if (use_gpu() && use_cudnn_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!enable_ir_optim_) { @@ -479,6 +525,9 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << tensorrt_max_batchsize_; ss << tensorrt_min_subgraph_size_; + ss << use_dlnne_; + ss << dlnne_min_subgraph_size_; + for (auto &op : trt_disabled_ops_) ss << op.c_str(); ss << ";"; @@ -512,6 +561,11 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << use_lite_; ss << use_xpu_; ss << xpu_l3_workspace_size_; + ss << xpu_locked_; + ss << xpu_autotune_; + ss << xpu_autotune_file_; + ss << xpu_precision_; + ss << xpu_adaptive_seqlen_; ss << thread_local_stream_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 4b6c746d57525..95b0831836843 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -537,6 +537,12 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_); } + if (config_.dlnne_enabled()) { + LOG(INFO) << "Dlnne subgraph is enabled"; + argument_.SetUseDlnne(true); + argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_); + } + if (config_.lite_engine_enabled()) { argument_.SetCpuMathLibraryNumThreads( config_.cpu_math_library_num_threads()); @@ -546,6 +552,11 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetLiteZeroCopy(config_.lite_zero_copy_); argument_.SetUseXpu(config_.use_xpu_); argument_.SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_); + argument_.SetXpuLocked(config_.xpu_locked_); + argument_.SetXpuAutotune(config_.xpu_autotune_); + argument_.SetXpuAutotuneFile(config_.xpu_autotune_file_); + 
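The copy constructor above finds the DLNNE passes the user has already removed by sorting both lists and taking their set difference. A standalone sketch of that diffing step with a few pass names:

#include <algorithm>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> all_passes{"is_test_pass", "conv_bn_fuse_pass",
                                      "dlnne_subgraph_pass"};
  std::vector<std::string> other_passes{"is_test_pass", "conv_bn_fuse_pass"};

  // set_difference requires sorted ranges.
  std::sort(all_passes.begin(), all_passes.end());
  std::sort(other_passes.begin(), other_passes.end());

  std::vector<std::string> deleted;
  std::set_difference(all_passes.begin(), all_passes.end(),
                      other_passes.begin(), other_passes.end(),
                      std::back_inserter(deleted));
  for (const auto& pass : deleted) {
    std::cout << "delete pass: " << pass << "\n";  // dlnne_subgraph_pass
  }
  return 0;
}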
argument_.SetXpuPrecision(config_.xpu_precision_); + argument_.SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_); LOG(INFO) << "Lite subgraph engine is enabled"; } diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index e492b32cb6cbe..2bbd4bb837a22 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -177,7 +177,10 @@ struct PD_INFER_DECL AnalysisConfig { /// void DisableGpu(); - void EnableXpu(int l3_workspace_size = 0xfffc00); + void EnableXpu(int l3_workspace_size = 0xfffc00, bool locked = false, + bool autotune = true, const std::string& autotune_file = "", + const std::string& precision = "int16", + bool adaptive_seqlen = false); /// /// \brief A boolean state telling whether the GPU is turned on. /// @@ -360,6 +363,9 @@ struct PD_INFER_DECL AnalysisConfig { /// bool tensorrt_dla_enabled() { return trt_use_dla_; } + void EnableDlnne(int min_subgraph_size = 3); + bool dlnne_enabled() const { return use_dlnne_; } + /// /// \brief Turn on the usage of Lite sub-graph engine. /// @@ -627,6 +633,10 @@ struct PD_INFER_DECL AnalysisConfig { std::vector trt_disabled_ops_{}; bool disable_trt_plugin_fp16_{false}; + // dlnne related. + bool use_dlnne_{false}; + int dlnne_min_subgraph_size_{3}; + // memory reuse related. bool enable_memory_optim_{false}; @@ -661,6 +671,11 @@ struct PD_INFER_DECL AnalysisConfig { bool thread_local_stream_{false}; bool use_xpu_{false}; int xpu_l3_workspace_size_; + bool xpu_locked_; + bool xpu_autotune_; + std::string xpu_autotune_file_; + std::string xpu_precision_; + bool xpu_adaptive_seqlen_; // mkldnn related. int mkldnn_cache_capacity_{0}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 1d77ddaf73ef7..2b7333edae0da 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -110,6 +110,15 @@ const std::vector kTRTSubgraphPasses({ "transpose_flatten_concat_fuse_pass", }); +const std::vector kDlnneSubgraphPasses({ + "is_test_pass", // + "simplify_with_basic_ops_pass", // + "conv_bn_fuse_pass", // + "depthwise_conv_bn_fuse_pass", // + "shuffle_channel_detect_pass", // + "dlnne_subgraph_pass", // +}); + const std::vector kLiteSubgraphPasses({ #ifdef PADDLE_WITH_LITE "lite_subgraph_pass", diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index a725ebab35ead..d7556b50031b7 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -242,6 +242,9 @@ class PD_INFER_DECL XpuPassStrategy final : public PassStrategy { /// \brief List of tensorRT subgraph passes. PD_INFER_DECL extern const std::vector kTRTSubgraphPasses; +/// \brief List of dlnne subgraph passes. +PD_INFER_DECL extern const std::vector kDlnneSubgraphPasses; + /// \brief List of lite subgraph passes. 
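A minimal usage sketch of the configuration options introduced above; the model path is a placeholder, and the argument values simply restate the defaults declared in paddle_analysis_config.h:

#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("/path/to/model_dir");

  // XPU with the new tuning knobs.
  config.EnableXpu(0xfffc00, /*locked=*/false, /*autotune=*/true,
                   /*autotune_file=*/"", /*precision=*/"int16",
                   /*adaptive_seqlen=*/false);

  // Or, independently, DLNNE subgraph offloading with a minimum subgraph size of 3,
  // which makes the builder switch to kDlnneSubgraphPasses.
  config.EnableDlnne(/*min_subgraph_size=*/3);
  return 0;
}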
PD_INFER_DECL extern const std::vector kLiteSubgraphPasses; diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc index 231639667244d..9bb52ba578025 100644 --- a/paddle/fluid/inference/capi/pd_config.cc +++ b/paddle/fluid/inference/capi/pd_config.cc @@ -260,6 +260,22 @@ bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) { return config->config.tensorrt_engine_enabled(); } +void PD_EnableDlnne(PD_AnalysisConfig* config, int min_subgraph_size) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); + config->config.EnableDlnne(min_subgraph_size); +} + +bool PD_DlnneEnabled(const PD_AnalysisConfig* config) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); + return config->config.dlnne_enabled(); +} + void PD_SwitchIrDebug(PD_AnalysisConfig* config, bool x) { PADDLE_ENFORCE_NOT_NULL( config, diff --git a/paddle/fluid/inference/capi_exp/CMakeLists.txt b/paddle/fluid/inference/capi_exp/CMakeLists.txt new file mode 100644 index 0000000000000..521d24329d464 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/CMakeLists.txt @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set(C_API_SRCS pd_config.cc pd_predictor.cc pd_tensor.cc pd_utils.cc) + +cc_library(paddle_inference_c SRCS ${C_API_SRCS} DEPS paddle_inference) + +if(NOT ON_INFER) + return() +endif() + +# Create inference capi shared library +cc_library(paddle_inference_c_shared SHARED SRCS ${C_API_SRCS} DEPS paddle_inference) +set_target_properties(paddle_inference_c_shared PROPERTIES OUTPUT_NAME paddle_inference_c) +if(WIN32) + target_link_libraries(paddle_inference_c_shared shlwapi.lib) +endif() diff --git a/paddle/fluid/inference/capi_exp/lod_demo.cc b/paddle/fluid/inference/capi_exp/lod_demo.cc new file mode 100644 index 0000000000000..2b049e992e71d --- /dev/null +++ b/paddle/fluid/inference/capi_exp/lod_demo.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file lod_demo.cc +/// +/// \brief a demo for user to learn how to inference by c api. +/// it rectify from +/// paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc. 
+/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" + +int main(int argc, char *argv[]) { + auto model_dir = FLAGS_infer_model; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/param").c_str()); + PD_ConfigDisableGpu(config); + + PD_Predictor *predictor = PD_PredictorCreate(config); + size_t input_num = PD_PredictorGetInputNum(predictor); + size_t output_num = PD_PredictorGetOutputNum(predictor); + + PD_OneDimArrayCstr *input_names = PD_PredictorGetInputNames(predictor); + LOG(INFO) << "Predictor start run!"; + PD_Tensor *inputs[2]; + inputs[0] = PD_PredictorGetInputHandle(predictor, input_names->data[0]); + inputs[1] = PD_PredictorGetInputHandle(predictor, input_names->data[1]); + LOG(INFO) << "Predictor start run!"; + // inputs[0]: word, use lod memory in stack + int32_t shape_0[2] = {11, 1}; + int64_t data_0[11 * 1] = {12673, 9763, 905, 284, 45, 7474, 20, 17, 1, 4, 9}; + size_t lod_layer_0[2] = {0, 11}; + PD_OneDimArraySize layer_0; + layer_0.size = 2; + layer_0.data = lod_layer_0; + PD_OneDimArraySize *layer_0_ptr = &layer_0; + PD_TwoDimArraySize lod_0; + lod_0.size = 1; + lod_0.data = &layer_0_ptr; + PD_TensorReshape(inputs[0], 2, shape_0); + PD_TensorCopyFromCpuInt64(inputs[0], data_0); + PD_TensorSetLod(inputs[0], &lod_0); + + // inputs[1]: mention, use lod memory in heap + int32_t shape_1[2] = {11, 1}; + int64_t data_1[11 * 1] = {27, 0, 0, 33, 34, 33, 0, 0, 0, 1, 2}; + PD_TwoDimArraySize *lod_1_ptr = new PD_TwoDimArraySize(); + lod_1_ptr->size = 1; + lod_1_ptr->data = new PD_OneDimArraySize *[1]; + lod_1_ptr->data[0] = new PD_OneDimArraySize(); + lod_1_ptr->data[0]->size = 2; + lod_1_ptr->data[0]->data = new size_t[2]; + lod_1_ptr->data[0]->data[0] = 0; + lod_1_ptr->data[0]->data[1] = 11; + + PD_TensorReshape(inputs[1], 2, shape_1); + PD_TensorCopyFromCpuInt64(inputs[1], data_1); + PD_TensorSetLod(inputs[1], lod_1_ptr); + // retrieve the lod memory + delete[] lod_1_ptr->data[0]->data; + delete lod_1_ptr->data[0]; + delete[] lod_1_ptr->data; + delete lod_1_ptr; + lod_1_ptr = nullptr; + + PD_PredictorRun(predictor); + PD_OneDimArrayCstr *output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor *output = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_TwoDimArraySize *output_lod = PD_TensorGetLod(output); + + PD_TwoDimArraySizeDestroy(output_lod); + PD_TensorDestroy(output); + PD_OneDimArrayCstrDestroy(output_names); + + PD_TensorDestroy(inputs[0]); + PD_TensorDestroy(inputs[1]); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} diff --git a/paddle/fluid/inference/capi_exp/pd_common.h b/paddle/fluid/inference/capi_exp/pd_common.h new file mode 100644 index 0000000000000..4b70ed7fbad29 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_common.h @@ -0,0 +1,75 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
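On the LoD layout used by the demo above: the level-0 entries are cumulative sequence offsets, so a single 11-token sequence is {0, 11}, and a batch of two sequences of lengths 4 and 7 would be {0, 4, 11} with the same {11, 1} shape. A sketch for the two-sequence case; the tensor handle is assumed to be obtained as in the demo:

#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

// word_tensor is assumed to be a valid input handle from
// PD_PredictorGetInputHandle, as in lod_demo.cc.
void SetTwoSequenceLod(PD_Tensor* word_tensor) {
  static size_t offsets[3] = {0, 4, 11};  // cumulative lengths: 4 and 7 tokens
  PD_OneDimArraySize level0;
  level0.size = 3;
  level0.data = offsets;
  PD_OneDimArraySize* level0_ptr = &level0;
  PD_TwoDimArraySize lod;
  lod.size = 1;  // one LoD level
  lod.data = &level0_ptr;
  PD_TensorSetLod(word_tensor, &lod);
}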
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#if defined(_WIN32) +#ifdef PADDLE_DLL_INFERENCE +#define PADDLE_CAPI_EXPORT __declspec(dllexport) +#else +#define PADDLE_CAPI_EXPORT __declspec(dllimport) +#endif // PADDLE_DLL_INFERENCE +#else +#define PADDLE_CAPI_EXPORT __attribute__((visibility("default"))) +#endif // _WIN32 + +/// +/// __pd_give means that a new object is returned. The user should make sure +/// that the returned pointer is used exactly once as a value for an __pd_take +/// argument. In between, it can be used as a value for as many __pd_keep +/// arguments as the user likes. +/// +#ifndef __pd_give +#define __pd_give +#endif +/// +/// __pd_take means that the object the argument points to is taken over by the +/// function and may no longer be used by the user as an argument to any other +/// function. The pointer value must be one returned by a function returning an +/// __pd_give pointer. +/// +#ifndef __pd_take +#define __pd_take +#endif +/// +/// __pd_keep means that the function will only use the object temporarily. The +/// object which the argument points to is not taken over by the function. After +/// the function has finished, the user can still use it as an argument to other +/// functions. +/// +#ifndef __pd_keep +#define __pd_keep +#endif + +typedef int8_t PD_Bool; +#define TRUE 1 +#define FALSE 0 + +#define PD_ENUM(type) \ + typedef int32_t type; \ + enum + +PD_ENUM(PD_PrecisionType){PD_PRECISION_FLOAT32 = 0, PD_PRECISION_INT8, + PD_PRECISION_HALF}; + +PD_ENUM(PD_PlaceType){PD_PLACE_UNK = -1, PD_PLACE_CPU, PD_PLACE_GPU, + PD_PLACE_XPU}; + +PD_ENUM(PD_DataType){ + PD_DATA_UNK = -1, PD_DATA_FLOAT32, PD_DATA_INT32, + PD_DATA_INT64, PD_DATA_UINT8, +}; diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc new file mode 100644 index 0000000000000..c45454e86bdaa --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -0,0 +1,382 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
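For reference, the PD_ENUM macro above expands PD_ENUM(PD_PlaceType){...}; into a fixed-width typedef plus an anonymous enum, which keeps the C API's enum values at a stable 32-bit width across the library boundary:

#include <cstdint>

// Expansion of PD_ENUM(PD_PlaceType){PD_PLACE_UNK = -1, PD_PLACE_CPU,
// PD_PLACE_GPU, PD_PLACE_XPU};
typedef int32_t PD_PlaceType;
enum { PD_PLACE_UNK = -1, PD_PLACE_CPU, PD_PLACE_GPU, PD_PLACE_XPU };

int main() {
  PD_PlaceType place = PD_PLACE_GPU;  // stored as a plain int32_t with value 1
  return place == PD_PLACE_GPU ? 0 : 1;
}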
+ +#include "paddle/fluid/inference/capi_exp/pd_config.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_NULL_POINTER_PARM(param) \ + PADDLE_ENFORCE_NOT_NULL( \ + param, paddle::platform::errors::InvalidArgument( \ + "The pointer of " #param " shouldn't be nullptr")) + +#define CHECK_AND_CONVERT_PD_CONFIG \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_config, paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle config shouldn't be nullptr")); \ + Config* config = reinterpret_cast(pd_config) + +using paddle_infer::Config; + +static Config::Precision ConvertToCxxPrecisionType(PD_PrecisionType precision) { + switch (precision) { + case PD_PRECISION_FLOAT32: + return Config::Precision::kFloat32; + case PD_PRECISION_INT8: + return Config::Precision::kInt8; + case PD_PRECISION_HALF: + return Config::Precision::kHalf; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupport paddle precision type %d.", precision)); + return Config::Precision::kFloat32; + } +} + +extern "C" { +__pd_give PD_Config* PD_ConfigCreate() { + return reinterpret_cast(new Config()); +} + +void PD_ConfigDestroy(__pd_take PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + delete reinterpret_cast(config); +} + +void PD_ConfigSetModel(__pd_keep PD_Config* pd_config, + const char* prog_file_path, + const char* params_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(prog_file_path); + CHECK_NULL_POINTER_PARM(params_file_path); + config->SetModel(prog_file_path, params_file_path); +} +void PD_ConfigSetProgFile(__pd_keep PD_Config* pd_config, + const char* prog_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(prog_file_path); + config->SetProgFile(prog_file_path); +} +void PD_ConfigSetParamsFile(__pd_keep PD_Config* pd_config, + const char* params_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(params_file_path); + config->SetParamsFile(params_file_path); +} +void PD_ConfigSetOptimCacheDir(__pd_keep PD_Config* pd_config, + const char* opt_cache_dir) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(opt_cache_dir); + config->SetOptimCacheDir(opt_cache_dir); +} + +void PD_ConfigSetModelDir(__pd_keep PD_Config* pd_config, + const char* model_dir) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(model_dir); + config->SetModel(model_dir); +} +const char* PD_ConfigGetModelDir(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->model_dir().c_str(); +} +const char* PD_ConfigGetProgFile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->prog_file().c_str(); +} +const char* PD_ConfigGetParamsFile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->params_file().c_str(); +} + +void PD_ConfigDisableFCPadding(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableFCPadding(); +} +PD_Bool PD_ConfigUseFcPadding(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_fc_padding(); +} + +void PD_ConfigEnableUseGpu(__pd_keep PD_Config* pd_config, + uint64_t memory_pool_init_size_mb, + int32_t device_id) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableUseGpu(memory_pool_init_size_mb, device_id); +} +void PD_ConfigDisableGpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableGpu(); +} +PD_Bool PD_ConfigUseGpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return 
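A lifecycle sketch following the __pd_give/__pd_take annotations above, assuming (as lod_demo.cc suggests, since it never destroys its config) that PD_PredictorCreate takes ownership of the config; the model paths are placeholders:

#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

int main() {
  // PD_ConfigCreate is __pd_give: the returned pointer must end up in exactly
  // one __pd_take consumer.
  PD_Config* config = PD_ConfigCreate();
  PD_ConfigSetModel(config, "/path/to/__model__", "/path/to/params");
  PD_ConfigDisableGpu(config);

  // Assumed __pd_take consumer: the predictor owns the config from here on, so
  // PD_ConfigDestroy is not called. A config that is never handed off would
  // instead need PD_ConfigDestroy.
  PD_Predictor* predictor = PD_PredictorCreate(config);
  PD_PredictorDestroy(predictor);
  return 0;
}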
config->use_gpu(); +} + +void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config, + int32_t l3_workspace_size) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableXpu(l3_workspace_size); +} +PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_xpu(); +} + +int32_t PD_ConfigGpuDeviceId(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->gpu_device_id(); +} +int32_t PD_ConfigXpuDeviceId(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->xpu_device_id(); +} +int32_t PD_ConfigMemoryPoolInitSizeMb(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->memory_pool_init_size_mb(); +} +float PD_ConfigFractionOfGpuMemoryForPool(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->fraction_of_gpu_memory_for_pool(); +} +void PD_ConfigEnableCudnn(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableCUDNN(); +} +PD_Bool PD_ConfigCudnnEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->cudnn_enabled(); +} + +void PD_ConfigSwitchIrOptim(__pd_keep PD_Config* pd_config, PD_Bool x) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SwitchIrOptim(x); +} +PD_Bool PD_ConfigIrOptim(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->ir_optim(); +} + +void PD_ConfigEnableTensorRtEngine(__pd_keep PD_Config* pd_config, + int32_t workspace_size, + int32_t max_batch_size, + int32_t min_subgraph_size, + PD_PrecisionType precision, + PD_Bool use_static, PD_Bool use_calib_mode) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableTensorRtEngine( + workspace_size, max_batch_size, min_subgraph_size, + ConvertToCxxPrecisionType(precision), use_static, use_calib_mode); +} +PD_Bool PD_ConfigTensorRtEngineEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->tensorrt_engine_enabled(); +} + +void PD_ConfigSetTrtDynamicShapeInfo(__pd_keep PD_Config* pd_config, + size_t tensor_num, + const char** tensor_name, + size_t* shapes_num, int32_t** min_shape, + int32_t** max_shape, int32_t** optim_shape, + PD_Bool disable_trt_plugin_fp16) { + CHECK_AND_CONVERT_PD_CONFIG; + std::map> min_input_shapes; + std::map> max_input_shapes; + std::map> optim_input_shapes; + for (size_t tensor_index = 0; tensor_index < tensor_num; ++tensor_index) { + std::string name(tensor_name[tensor_index]); + std::vector min_input_shape, max_input_shape, optim_input_shape; + for (size_t shape_index = 0; shape_index < shapes_num[tensor_index]; + ++shape_index) { + min_input_shape.emplace_back(min_shape[tensor_index][shape_index]); + max_input_shape.emplace_back(max_shape[tensor_index][shape_index]); + optim_input_shape.emplace_back(optim_shape[tensor_index][shape_index]); + } + min_input_shapes[name] = std::move(min_input_shape); + max_input_shapes[name] = std::move(max_input_shape); + optim_input_shapes[name] = std::move(optim_input_shape); + } + config->SetTRTDynamicShapeInfo(min_input_shapes, max_input_shapes, + optim_input_shapes, disable_trt_plugin_fp16); +} + +void PD_ConfigDisableTensorRtOPs(__pd_keep PD_Config* pd_config, size_t ops_num, + const char** ops_name) { + CHECK_AND_CONVERT_PD_CONFIG; + std::vector ops_list; + for (size_t index = 0; index < ops_num; ++index) { + ops_list.emplace_back(ops_name[index]); + } + config->Exp_DisableTensorRtOPs(ops_list); +} + +void PD_ConfigEnableTensorRtOSS(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + 
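A usage sketch for PD_ConfigSetTrtDynamicShapeInfo above, covering a single NCHW input with a dynamic batch dimension; the tensor name and shapes are illustrative:

#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

void SetDynamicShape(PD_Config* config) {
  const char* names[1] = {"image"};
  size_t shapes_num[1] = {4};                   // 4 dims per input
  int32_t min_data[4] = {1, 3, 224, 224};       // min batch = 1
  int32_t max_data[4] = {8, 3, 224, 224};       // max batch = 8
  int32_t opt_data[4] = {4, 3, 224, 224};       // optimized batch = 4
  int32_t* min_shape[1] = {min_data};
  int32_t* max_shape[1] = {max_data};
  int32_t* opt_shape[1] = {opt_data};
  PD_ConfigSetTrtDynamicShapeInfo(config, 1, names, shapes_num, min_shape,
                                  max_shape, opt_shape,
                                  /*disable_trt_plugin_fp16=*/FALSE);
}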
config->EnableTensorRtOSS(); +} +PD_Bool PD_ConfigTensorRtOssEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->tensorrt_oss_enabled(); +} + +void PD_ConfigEnableTensorRtDla(__pd_keep PD_Config* pd_config, + int32_t dla_core) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableTensorRtDLA(dla_core); +} +PD_Bool PD_ConfigTensorRtDlaEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->tensorrt_dla_enabled(); +} + +void PD_ConfigEnableLiteEngine(__pd_keep PD_Config* pd_config, + PD_PrecisionType precision, PD_Bool zero_copy, + size_t passes_filter_num, + const char** passes_filter, + size_t ops_filter_num, const char** ops_filter) { + CHECK_AND_CONVERT_PD_CONFIG; + std::vector passes_filters, ops_filters; + for (size_t index = 0; index < passes_filter_num; ++index) { + passes_filters.emplace_back(passes_filter[index]); + } + for (size_t index = 0; index < ops_filter_num; ++index) { + ops_filters.emplace_back(ops_filter[index]); + } + config->EnableLiteEngine(ConvertToCxxPrecisionType(precision), zero_copy, + passes_filters, ops_filters); +} +PD_Bool PD_ConfigLiteEngineEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->lite_engine_enabled(); +} + +void PD_ConfigSwitchIrDebug(__pd_keep PD_Config* pd_config, PD_Bool x) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SwitchIrDebug(x); +} +void PD_ConfigEnableMKLDNN(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableMKLDNN(); +} +void PD_ConfigSetMkldnnCacheCapacity(__pd_keep PD_Config* pd_config, + int32_t capacity) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SetMkldnnCacheCapacity(capacity); +} +PD_Bool PD_ConfigMkldnnEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->mkldnn_enabled(); +} +void PD_ConfigSetCpuMathLibraryNumThreads( + __pd_keep PD_Config* pd_config, int32_t cpu_math_library_num_threads) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SetCpuMathLibraryNumThreads(cpu_math_library_num_threads); +} +int32_t PD_ConfigGetCpuMathLibraryNumThreads(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->cpu_math_library_num_threads(); +} + +void PD_ConfigSetMkldnnOp(__pd_keep PD_Config* pd_config, size_t ops_num, + const char** op_list) { + CHECK_AND_CONVERT_PD_CONFIG; + std::unordered_set op_names; + for (size_t index = 0; index < ops_num; ++index) { + op_names.emplace(op_list[index]); + } + config->SetMKLDNNOp(std::move(op_names)); +} +void PD_ConfigEnableMkldnnQuantizer(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableMkldnnQuantizer(); +} +void PD_ConfigEnableMkldnnBfloat16(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableMkldnnBfloat16(); +} +PD_Bool PD_ConfigMkldnnBfloat16Enabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->mkldnn_bfloat16_enabled(); +} +void PD_ConfigSetBfloat16Op(__pd_keep PD_Config* pd_config, size_t ops_num, + const char** op_list) { + CHECK_AND_CONVERT_PD_CONFIG; + std::unordered_set op_names; + for (size_t index = 0; index < ops_num; ++index) { + op_names.emplace(op_list[index]); + } + config->SetBfloat16Op(std::move(op_names)); +} +PD_Bool PD_ConfigThreadLocalStreamEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->thread_local_stream_enabled(); +} +PD_Bool PD_ConfigMkldnnQuantizerEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return 
config->mkldnn_quantizer_enabled(); +} +void PD_ConfigSetModelBuffer(__pd_keep PD_Config* pd_config, + const char* prog_buffer, size_t prog_buffer_size, + const char* params_buffer, + size_t params_buffer_size) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer, + params_buffer_size); +} +PD_Bool PD_ConfigModelFromMemory(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->model_from_memory(); +} +void PD_ConfigEnableMemoryOptim(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableMemoryOptim(); +} +PD_Bool PD_ConfigMemoryOptimEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->enable_memory_optim(); +} +void PD_ConfigEnableProfile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableProfile(); +} +PD_Bool PD_ConfigProfileEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->profile_enabled(); +} +void PD_ConfigDisableGlogInfo(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableGlogInfo(); +} +PD_Bool PD_ConfigGlogInfoDisabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->glog_info_disabled(); +} +void PD_ConfigSetInvalid(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SetInValid(); +} +PD_Bool PD_ConfigIsValid(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->is_valid(); +} +void PD_ConfigEnableGpuMultiStream(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableGpuMultiStream(); +} +void PD_ConfigPartiallyRelease(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->PartiallyRelease(); +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h new file mode 100644 index 0000000000000..e44983e24484e --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -0,0 +1,571 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_config.h +/// +/// \brief interface for paddle config +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Config PD_Config; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Create a paddle config +/// +/// \return new config. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Config* PD_ConfigCreate(); +/// +/// \brief Destroy the paddle config +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDestroy(__pd_take PD_Config* pd_config); +/// +/// \brief Set the combined model with two specific pathes for program and +/// parameters. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_file_path model file path of the combined model. 
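A sketch of loading a model from memory through PD_ConfigSetModelBuffer above; the file paths are placeholders, and the buffers are kept alive for the program's lifetime to stay conservative about buffer ownership:

#include <fstream>
#include <sstream>
#include <string>
#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

static std::string ReadFile(const std::string& path) {
  std::ifstream in(path, std::ios::binary);
  std::ostringstream buf;
  buf << in.rdbuf();
  return buf.str();
}

void ConfigFromMemory(PD_Config* config) {
  // Static so the raw buffers outlive this call regardless of whether the
  // config copies them.
  static std::string prog = ReadFile("/path/to/__model__");
  static std::string params = ReadFile("/path/to/params");
  PD_ConfigSetModelBuffer(config, prog.data(), prog.size(), params.data(),
                          params.size());
  // PD_ConfigModelFromMemory(config) should now report TRUE.
}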
+/// \param[in] params_file_path params file path of the combined model. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModel(__pd_keep PD_Config* pd_config, + const char* prog_file_path, + const char* params_file_path); +/// +/// \brief Set the model file path of a combined model. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_file_path model file path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetProgFile( + __pd_keep PD_Config* pd_config, const char* prog_file_path); +/// +/// \brief Set the params file path of a combined model. +/// +/// \param[in] pd_onfig config +/// \param[in] params_file_path params file path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetParamsFile( + __pd_keep PD_Config* pd_config, const char* params_file_path); +/// +/// \brief Set the path of optimization cache directory. +/// \param[in] pd_onfig config +/// \param[in] opt_cache_dir the path of optimization cache directory. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetOptimCacheDir( + __pd_keep PD_Config* pd_config, const char* opt_cache_dir); +/// +/// \brief Set the no-combined model dir path. +/// \param[in] pd_onfig config +/// \param[in] model_dir model dir path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModelDir( + __pd_keep PD_Config* pd_config, const char* model_dir); +/// +/// \brief Get the model directory path. +/// +/// \param[in] pd_onfig config +/// \return The model directory path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetModelDir( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the program file path. +/// +/// \param[in] pd_onfig config +/// \return The program file path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetProgFile( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the params file path. +/// +/// \param[in] pd_onfig config +/// \return The params file path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetParamsFile( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn off FC Padding. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableFCPadding( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether fc padding is used. +/// +/// \param[in] pd_onfig config +/// \return Whether fc padding is used. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseFcPadding( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on GPU. +/// +/// \param[in] pd_onfig config +/// \param[in] memory_pool_init_size_mb initial size of the GPU memory pool in +/// MB. +/// \param[in] device_id device_id the GPU card to use. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableUseGpu( + __pd_keep PD_Config* pd_config, uint64_t memory_pool_init_size_mb, + int32_t device_id); +/// +/// \brief Turn off GPU. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the GPU is turned on. +/// +/// \brief Turn off GPU. +/// \return Whether the GPU is turned on. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseGpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on XPU. +/// +/// \param[in] pd_onfig config +/// \param[in] l3_workspace_size l3 workspace size. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu( + __pd_keep PD_Config* pd_config, int32_t l3_workspace_size); +/// +/// \brief A boolean state telling whether the XPU is turned on. +/// +/// \param[in] pd_onfig config +/// \return Whether the XPU is turned on. 
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseXpu(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Get the GPU device id.
+///
+/// \param[in] pd_config config
+/// \return The GPU device id.
+///
+PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGpuDeviceId(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Get the XPU device id.
+///
+/// \param[in] pd_config config
+/// \return The XPU device id.
+///
+PADDLE_CAPI_EXPORT extern int32_t PD_ConfigXpuDeviceId(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Get the initial size in MB of the GPU memory pool.
+///
+/// \param[in] pd_config config
+/// \return The initial size in MB of the GPU memory pool.
+///
+PADDLE_CAPI_EXPORT extern int32_t PD_ConfigMemoryPoolInitSizeMb(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Get the proportion of the initial memory pool size compared to the
+/// device.
+///
+/// \param[in] pd_config config
+/// \return The proportion of the initial memory pool size.
+///
+PADDLE_CAPI_EXPORT extern float PD_ConfigFractionOfGpuMemoryForPool(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn on CUDNN.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableCudnn(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether to use CUDNN.
+///
+/// \param[in] pd_config config
+/// \return Whether to use CUDNN.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigCudnnEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Control whether to perform IR graph optimization.
+/// If turned off, the AnalysisConfig will act just like a NativeConfig.
+///
+/// \param[in] pd_config config
+/// \param[in] x Whether the ir graph optimization is activated.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSwitchIrOptim(
+ __pd_keep PD_Config* pd_config, PD_Bool x);
+///
+/// \brief A boolean state telling whether the ir graph optimization is
+/// activated.
+///
+/// \param[in] pd_config config
+/// \return Whether to use ir graph optimization.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIrOptim(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn on the TensorRT engine.
+/// The TensorRT engine will accelerate some subgraphs in the original Fluid
+/// computation graph. In some models such as resnet50, GoogleNet and so on,
+/// it gains significant performance acceleration.
+///
+/// \param[in] pd_config config
+/// \param[in] workspace_size The memory size(in byte) used for TensorRT
+/// workspace.
+/// \param[in] max_batch_size The maximum batch size of this prediction task,
+/// better set as small as possible for less performance loss.
+/// \param[in] min_subgraph_size The minimum TensorRT subgraph size needed, if a
+/// subgraph is smaller than this, it will not be transferred to TensorRT
+/// engine.
+/// \param[in] precision The precision used in TensorRT.
+/// \param[in] use_static Serialize optimization information to disk for
+/// reusing.
+/// \param[in] use_calib_mode Use TRT int8 calibration(post training
+/// quantization).
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtEngine(
+ __pd_keep PD_Config* pd_config, int32_t workspace_size,
+ int32_t max_batch_size, int32_t min_subgraph_size,
+ PD_PrecisionType precision, PD_Bool use_static, PD_Bool use_calib_mode);
+///
+/// \brief A boolean state telling whether the TensorRT engine is used.
+///
+/// \param[in] pd_config config
+/// \return Whether the TensorRT engine is used.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtEngineEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Set min, max, opt shape for TensorRT Dynamic shape mode.
+///
+/// \param[in] pd_config config
+/// \param[in] tensor_num The number of the subgraph input.
+/// \param[in] tensor_name The name of every subgraph input.
+/// \param[in] shapes_num The shape size of every subgraph input.
+/// \param[in] min_shape The min input shape of every subgraph input.
+/// \param[in] max_shape The max input shape of every subgraph input.
+/// \param[in] optim_shape The opt input shape of every subgraph input.
+/// \param[in] disable_trt_plugin_fp16 Setting this parameter to true means that
+/// TRT plugin will not run fp16.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSetTrtDynamicShapeInfo(
+ __pd_keep PD_Config* pd_config, size_t tensor_num, const char** tensor_name,
+ size_t* shapes_num, int32_t** min_shape, int32_t** max_shape,
+ int32_t** optim_shape, PD_Bool disable_trt_plugin_fp16);
+///
+/// \brief Prevent ops running in Paddle-TRT
+/// NOTE: just experimental, not an official stable API, easy to be broken.
+///
+/// \param[in] pd_config config
+/// \param[in] ops_num ops number
+/// \param[in] ops_name ops name
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigDisableTensorRtOPs(
+ __pd_keep PD_Config* pd_config, size_t ops_num, const char** ops_name);
+///
+/// \brief Replace some TensorRT plugins with TensorRT OSS(
+/// https://github.com/NVIDIA/TensorRT), with which some models' inference
+/// may achieve higher performance. Libnvinfer_plugin.so greater than
+/// V7.2.1 is needed.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtOSS(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether to use the TensorRT OSS.
+///
+/// \param[in] pd_config config
+/// \return Whether to use the TensorRT OSS.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtOssEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Enable TensorRT DLA
+///
+/// \param[in] pd_config config
+/// \param[in] dla_core ID of DLACore, which should be 0, 1,
+/// ..., IBuilder.getNbDLACores() - 1
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtDla(
+ __pd_keep PD_Config* pd_config, int32_t dla_core);
+///
+/// \brief A boolean state telling whether to use the TensorRT DLA.
+///
+/// \param[in] pd_config config
+/// \return Whether to use the TensorRT DLA.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtDlaEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn on the usage of Lite sub-graph engine.
+///
+/// \param[in] pd_config config
+/// \param[in] precision Precision used in Lite sub-graph engine.
+/// \param[in] zero_copy whether to use zero copy.
+/// \param[in] passes_filter_num The number of passes used in Lite sub-graph
+/// engine.
+/// \param[in] passes_filter The name of passes used in Lite sub-graph engine.
+/// \param[in] ops_filter_num The number of operators not supported by Lite.
+/// \param[in] ops_filter The name of operators not supported by Lite.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableLiteEngine(
+ __pd_keep PD_Config* pd_config, PD_PrecisionType precision,
+ PD_Bool zero_copy, size_t passes_filter_num, const char** passes_filter,
+ size_t ops_filter_num, const char** ops_filter);
+///
+/// \brief A boolean state indicating whether the Lite sub-graph engine is
+/// used.
+///
+/// \param[in] pd_config config
+/// \return Whether the Lite sub-graph engine is used.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigLiteEngineEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Control whether to debug IR graph analysis phase.
+/// This will generate DOT files for visualizing the computation graph after
+/// each analysis pass applied.
+///
+/// \param[in] pd_config config
+/// \param[in] x whether to debug IR graph analysis phase.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSwitchIrDebug(
+ __pd_keep PD_Config* pd_config, PD_Bool x);
+///
+/// \brief Turn on MKLDNN.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMKLDNN(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Set the cache capacity of different input shapes for MKLDNN.
+/// Default value 0 means not caching any shape.
+/// Please see MKL-DNN Data Caching Design Document:
+/// https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/mkldnn/caching/caching.md
+///
+/// \param[in] pd_config config
+/// \param[in] capacity The cache capacity.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSetMkldnnCacheCapacity(
+ __pd_keep PD_Config* pd_config, int32_t capacity);
+///
+/// \brief A boolean state telling whether to use the MKLDNN.
+///
+/// \param[in] pd_config config
+/// \return Whether to use the MKLDNN.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Set the number of cpu math library threads.
+///
+/// \param[in] pd_config config
+/// \param cpu_math_library_num_threads The number of cpu math library
+/// threads.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSetCpuMathLibraryNumThreads(
+ __pd_keep PD_Config* pd_config, int32_t cpu_math_library_num_threads);
+///
+/// \brief An int state telling how many threads are used in the CPU math
+/// library.
+///
+/// \param[in] pd_config config
+/// \return The number of threads used in the CPU math library.
+///
+PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGetCpuMathLibraryNumThreads(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Specify the operator type list to use MKLDNN acceleration.
+///
+/// \param[in] pd_config config
+/// \param[in] ops_num The number of operator type list.
+/// \param[in] op_list The name of operator type list.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSetMkldnnOp(
+ __pd_keep PD_Config* pd_config, size_t ops_num, const char** op_list);
+///
+/// \brief Turn on MKLDNN quantization.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMkldnnQuantizer(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether the MKLDNN quantization is enabled.
+///
+/// \param[in] pd_config config
+/// \return Whether the MKLDNN quantization is enabled.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnQuantizerEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn on MKLDNN bfloat16.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMkldnnBfloat16(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether to use the MKLDNN Bfloat16.
+///
+/// \param[in] pd_config config
+/// \return Whether to use the MKLDNN Bfloat16.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnBfloat16Enabled(
+ __pd_keep PD_Config* pd_config);
+/// \brief Specify the operator type list to use Bfloat16 acceleration.
+///
+/// \param[in] pd_config config
+/// \param[in] ops_num The number of operator type list.
+/// \param[in] op_list The name of operator type list.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSetBfloat16Op(
+ __pd_keep PD_Config* pd_config, size_t ops_num, const char** op_list);
+///
+/// \brief Enable the GPU multi-computing stream feature.
+/// NOTE: The current behavior of this interface is to bind the computation
+/// stream to the thread, and this behavior may be changed in the future.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableGpuMultiStream(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether the thread local CUDA stream is
+/// enabled.
+///
+/// \param[in] pd_config config
+/// \return Whether the thread local CUDA stream is enabled.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigThreadLocalStreamEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Specify the memory buffer of program and parameter.
+/// Used when model and params are loaded directly from memory.
+///
+/// \param[in] pd_config config
+/// \param[in] prog_buffer The memory buffer of program.
+/// \param[in] prog_buffer_size The size of the model data.
+/// \param[in] params_buffer The memory buffer of the combined parameters file.
+/// \param[in] params_buffer_size The size of the combined parameters data.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSetModelBuffer(
+ __pd_keep PD_Config* pd_config, const char* prog_buffer,
+ size_t prog_buffer_size, const char* params_buffer,
+ size_t params_buffer_size);
+///
+/// \brief A boolean state telling whether the model is set from the CPU
+/// memory.
+///
+/// \param[in] pd_config config
+/// \return Whether model and params are loaded directly from memory.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigModelFromMemory(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn on memory optimization.
+/// NOTE still in development.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMemoryOptim(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether the memory optimization is
+/// activated.
+///
+/// \param[in] pd_config config
+/// \return Whether the memory optimization is activated.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMemoryOptimEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn on profiling report.
+/// If not turned on, no profiling report will be generated.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableProfile(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether the profiler is activated.
+///
+/// \param[in] pd_config config
+/// \return bool Whether the profiler is activated.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigProfileEnabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Mute all logs in Paddle inference.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGlogInfo(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether logs in Paddle inference are muted.
+///
+/// \param[in] pd_config config
+/// \return Whether logs in Paddle inference are muted.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigGlogInfoDisabled(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Set the Config to be invalid.
+/// This is to ensure that a Config can only be used in one
+/// Predictor.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSetInvalid(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether the Config is valid.
+///
+/// \param[in] pd_config config
+/// \return Whether the Config is valid.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIsValid(
+ __pd_keep PD_Config* pd_config);
+///
+/// \brief Partially release the memory
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigPartiallyRelease(
+ __pd_keep PD_Config* pd_config);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/paddle/fluid/operators/distributed/distributed_pb.h b/paddle/fluid/inference/capi_exp/pd_inference_api.h
similarity index 58%
rename from paddle/fluid/operators/distributed/distributed_pb.h
rename to paddle/fluid/inference/capi_exp/pd_inference_api.h
index f1c662be9af67..5f21dca1a7bf6 100644
--- a/paddle/fluid/operators/distributed/distributed_pb.h
+++ b/paddle/fluid/inference/capi_exp/pd_inference_api.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 // http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License
@@ -14,17 +14,9 @@
 #pragma once
-#ifdef PADDLE_WITH_DISTRIBUTE
-
-#ifdef PADDLE_WITH_GRPC
-
-#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
-
-#else // PADDLE_WITH_GRPC
-
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
-
-#endif // PADDLE_WITH_GRPC
-
-#endif // PADDLE_WITH_DISTRIBUTE
+#include "pd_common.h" // NOLINT
+#include "pd_config.h" // NOLINT
+#include "pd_predictor.h" // NOLINT
+#include "pd_tensor.h" // NOLINT
+#include "pd_types.h" // NOLINT
diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.cc b/paddle/fluid/inference/capi_exp/pd_predictor.cc
new file mode 100644
index 0000000000000..f5287a5152957
--- /dev/null
+++ b/paddle/fluid/inference/capi_exp/pd_predictor.cc
@@ -0,0 +1,109 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
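The PD_Config functions declared above map one-to-one onto the C++ paddle_infer::Config methods. Below is a minimal, illustrative sketch of how they combine; it is not part of the patch, and it assumes PD_ConfigCreate, PD_ConfigSetModel and PD_ConfigEnableUseGpu are declared earlier in pd_config.h (outside this hunk), with placeholder model paths.

/* Illustrative usage sketch; not part of the patch. PD_ConfigCreate,
 * PD_ConfigSetModel and PD_ConfigEnableUseGpu are assumed to be declared
 * earlier in pd_config.h; the model paths are placeholders. */
#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

static PD_Config* build_config(void) {
  PD_Config* config = PD_ConfigCreate();                          /* assumed */
  PD_ConfigSetModel(config, "model.pdmodel", "model.pdiparams");  /* assumed */
  PD_ConfigEnableUseGpu(config, 100, 0);     /* assumed: pool MB, device id */
  PD_ConfigSwitchIrOptim(config, 1);         /* turn on IR graph optimization */
  PD_ConfigEnableMemoryOptim(config);        /* reuse intermediate buffers */
  PD_ConfigSetCpuMathLibraryNumThreads(config, 4);
  /* Getters such as PD_ConfigMemoryPoolInitSizeMb(config) or
   * PD_ConfigIrOptim(config) can be used to verify the effective settings. */
  return config;  /* ownership is taken (__pd_take) by PD_PredictorCreate */
}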
+ +#include "paddle/fluid/inference/capi_exp/pd_predictor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" +#include "paddle/fluid/inference/capi_exp/types_internal.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_AND_CONVERT_PD_PREDICTOR \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_predictor, \ + paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle predictor shouldn't be nullptr")); \ + auto& predictor = pd_predictor->predictor + +extern "C" { +__pd_give PD_Predictor* PD_PredictorCreate(__pd_take PD_Config* pd_config) { + PADDLE_ENFORCE_NOT_NULL( + pd_config, paddle::platform::errors::InvalidArgument( + "The pointer of paddle predictor shouldn't be nullptr")); + PD_Predictor* pd_predictor = new PD_Predictor(); + paddle_infer::Config* config = + reinterpret_cast(pd_config); + pd_predictor->predictor = paddle_infer::CreatePredictor(*config); + delete config; + return pd_predictor; +} + +__pd_give PD_Predictor* PD_PredictorClone( + __pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + PD_Predictor* new_predictor = new PD_Predictor(); + new_predictor->predictor = predictor->Clone(); + return new_predictor; +} + +__pd_give PD_OneDimArrayCstr* PD_PredictorGetInputNames( + __pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + std::vector names = predictor->GetInputNames(); + return paddle_infer::CvtVecToOneDimArrayCstr(names); +} + +__pd_give PD_OneDimArrayCstr* PD_PredictorGetOutputNames( + __pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + std::vector names = predictor->GetOutputNames(); + return paddle_infer::CvtVecToOneDimArrayCstr(names); +} + +size_t PD_PredictorGetInputNum(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->GetInputNames().size(); +} + +size_t PD_PredictorGetOutputNum(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->GetOutputNames().size(); +} +__pd_give PD_Tensor* PD_PredictorGetInputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name) { + CHECK_AND_CONVERT_PD_PREDICTOR; + PD_Tensor* pd_tensor = new PD_Tensor(); + pd_tensor->tensor = predictor->GetInputHandle(name); + return pd_tensor; +} + +__pd_give PD_Tensor* PD_PredictorGetOutputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name) { + CHECK_AND_CONVERT_PD_PREDICTOR; + PD_Tensor* pd_tensor = new PD_Tensor(); + pd_tensor->tensor = predictor->GetOutputHandle(name); + return pd_tensor; +} + +PD_Bool PD_PredictorRun(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->Run(); +} + +void PD_PredictorClearIntermediateTensor(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + predictor->ClearIntermediateTensor(); +} + +uint64_t PD_PredictorTryShrinkMemory(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->TryShrinkMemory(); +} + +void PD_PredictorDestroy(__pd_take PD_Predictor* pd_predictor) { + delete pd_predictor; +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.h b/paddle/fluid/inference/capi_exp/pd_predictor.h new file mode 100644 index 0000000000000..d4542d0b6d394 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_predictor.h @@ -0,0 +1,148 @@ +// Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_predictor.h +/// +/// \brief interface for paddle predictor +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Predictor PD_Predictor; +typedef struct PD_Config PD_Config; +typedef struct PD_Tensor PD_Tensor; +typedef struct PD_OneDimArrayCstr PD_OneDimArrayCstr; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Create a new Predictor +/// +/// \param[in] Config config +/// \return new predicor. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Predictor* PD_PredictorCreate( + __pd_take PD_Config* pd_config); +/// +/// \brief Clone a new Predictor +/// +/// \param[in] pd_predictor predictor +/// \return new predictor. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Predictor* PD_PredictorClone( + __pd_keep PD_Predictor* pd_predictor); +/// +/// \brief Get the input names +/// +/// \param[in] pd_predictor predictor +/// \return input names +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr* +PD_PredictorGetInputNames(__pd_keep PD_Predictor* pd_predictor); +/// +/// \brief Get the output names +/// +/// \param[in] pd_predictor predictor +/// \return output names +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr* +PD_PredictorGetOutputNames(__pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the input number +/// +/// \param[in] pd_predictor predictor +/// \return input number +/// +PADDLE_CAPI_EXPORT extern size_t PD_PredictorGetInputNum( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the output number +/// +/// \param[in] pd_predictor predictor +/// \return output number +/// +PADDLE_CAPI_EXPORT extern size_t PD_PredictorGetOutputNum( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the Input Tensor object +/// +/// \param[in] pd_predictor predictor +/// \param[in] name input name +/// \return input tensor +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Tensor* PD_PredictorGetInputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name); + +/// +/// \brief Get the Output Tensor object +/// +/// \param[in] pd_predictor predictor +/// \param[in] name output name +/// \return output tensor +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Tensor* PD_PredictorGetOutputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name); + +/// +/// \brief Run the prediction engine +/// +/// \param[in] pd_predictor predictor +/// \return Whether the function executed successfully +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_PredictorRun( + __pd_keep PD_Predictor* pd_predictor); + +/// \brief Clear the intermediate tensors of the predictor +/// +/// \param[in] pd_predictor predictor +/// +PADDLE_CAPI_EXPORT extern void PD_PredictorClearIntermediateTensor( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Release all tmp tensor to compress the size of the memory pool. 
+/// The memory pool is considered to be composed of a list of chunks, if +/// the chunk is not occupied, it can be released. +/// +/// \param[in] pd_predictor predictor +/// \return Number of bytes released. It may be smaller than the actual +/// released memory, because part of the memory is not managed by the +/// MemoryPool. +/// +PADDLE_CAPI_EXPORT extern uint64_t PD_PredictorTryShrinkMemory( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Destroy a predictor object +/// +/// \param[in] pd_predictor predictor +/// +PADDLE_CAPI_EXPORT extern void PD_PredictorDestroy( + __pd_take PD_Predictor* pd_predictor); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/pd_tensor.cc b/paddle/fluid/inference/capi_exp/pd_tensor.cc new file mode 100644 index 0000000000000..9c661dea6f2bb --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_tensor.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/capi_exp/pd_tensor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" +#include "paddle/fluid/inference/capi_exp/types_internal.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_AND_CONVERT_PD_TENSOR \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_tensor, paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle tensor shouldn't be nullptr")); \ + auto& tensor = pd_tensor->tensor + +extern "C" { + +void PD_TensorDestroy(__pd_take PD_Tensor* pd_tensor) { delete pd_tensor; } +void PD_TensorReshape(__pd_keep PD_Tensor* pd_tensor, size_t shape_size, + int32_t* shape) { + CHECK_AND_CONVERT_PD_TENSOR; + std::vector shapes(shape_size); + for (size_t index = 0; index < shape_size; ++index) { + shapes[index] = shape[index]; + } + tensor->Reshape(shapes); +} + +#define REPEAT_ALL_DATA_TYPE(func) \ + func(float, Float) func(int64_t, Int64) func(int32_t, Int32) \ + func(uint8_t, Uint8) func(int8_t, Int8) + +#define PD_TENSOR_MUTABLE_DATA_IMPL(type, Type) \ + type* PD_TensorMutableData##Type(__pd_keep PD_Tensor* pd_tensor, \ + PD_PlaceType place) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + return tensor->mutable_data(paddle_infer::CvtToCxxPlaceType(place)); \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_MUTABLE_DATA_IMPL) +#undef PD_TENSOR_MUTABLE_DATA_IMPL + +#define PD_TENSOR_DATA_IMPL(type, Type) \ + type* PD_TensorData##Type(__pd_keep PD_Tensor* pd_tensor, \ + PD_PlaceType* place, int32_t* size) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + PADDLE_ENFORCE_NOT_NULL(place, \ + paddle::platform::errors::InvalidArgument( \ + "The pointer of place shouldn't be nullptr")); \ + PADDLE_ENFORCE_NOT_NULL(size, \ + paddle::platform::errors::InvalidArgument( \ + "The pointer of size shouldn't be nullptr")); \ + paddle_infer::PlaceType cxx_palce_type; \ + int cxx_size; \ + 
type* data = tensor->data(&cxx_palce_type, &cxx_size); \ + *place = paddle_infer::CvtFromCxxPlaceType(cxx_palce_type); \ + *size = static_cast(cxx_size); \ + return data; \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_DATA_IMPL) +#undef PD_TENSOR_DATA_IMPL + +#define PD_TENSOR_COPY_FROM_CPU_IMPL(type, Type) \ + void PD_TensorCopyFromCpu##Type(__pd_keep PD_Tensor* pd_tensor, \ + const type* data) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + tensor->CopyFromCpu(data); \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_COPY_FROM_CPU_IMPL) +#undef PD_TENSOR_COPY_FROM_CPU_IMPL + +#define PD_TENSOR_COPY_TO_CPU_IMPL(type, Type) \ + void PD_TensorCopyToCpu##Type(__pd_keep PD_Tensor* pd_tensor, type* data) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + tensor->CopyToCpu(data); \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_COPY_TO_CPU_IMPL) +#undef PD_TENSOR_COPY_TO_CPU_IMPL + +#undef REPEAT_ALL_DATA_TYPE + +__pd_give PD_OneDimArrayInt32* PD_TensorGetShape( + __pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtVecToOneDimArrayInt32(tensor->shape()); +} +void PD_TensorSetLod(__pd_keep PD_Tensor* pd_tensor, + __pd_keep PD_TwoDimArraySize* lod) { + CHECK_AND_CONVERT_PD_TENSOR; + tensor->SetLoD(paddle_infer::CvtTwoDimArrayToVecSize(lod)); +} +__pd_give PD_TwoDimArraySize* PD_TensorGetLod(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtVecToTwoDimArraySize(tensor->lod()); +} +const char* PD_TensorGetName(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return tensor->name().c_str(); +} +PD_DataType PD_TensorGetDataType(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtFromCxxDatatype(tensor->type()); +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_tensor.h b/paddle/fluid/inference/capi_exp/pd_tensor.h new file mode 100644 index 0000000000000..29ea4b5d62e43 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_tensor.h @@ -0,0 +1,287 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_tensor.h +/// +/// \brief interface for paddle tensor +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Tensor PD_Tensor; +typedef struct PD_OneDimArrayInt32 PD_OneDimArrayInt32; +typedef struct PD_TwoDimArraySize PD_TwoDimArraySize; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Destroy the paddle tensor +/// +/// \param[in] pd_tensor tensor +/// +PADDLE_CAPI_EXPORT extern void PD_TensorDestroy(__pd_take PD_Tensor* pd_tensor); + +/// +/// \brief Reset the shape of the tensor. +/// Generally it's only used for the input tensor. +/// Reshape must be called before calling PD_TensorMutableData*() or +/// PD_TensorCopyFromCpu*() +/// +/// \param[in] pd_tensor tensor. +/// \param[in] shape_size The size of shape. +/// \param[in] shape The shape to set. 
+/// +PADDLE_CAPI_EXPORT extern void PD_TensorReshape(__pd_keep PD_Tensor* pd_tensor, + size_t shape_size, + int32_t* shape); + +/// +/// \brief Get the memory pointer in CPU or GPU with 'float' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern float* PD_TensorMutableDataFloat( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int64_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int64_t* PD_TensorMutableDataInt64( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int32_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int32_t* PD_TensorMutableDataInt32( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'uint8_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern uint8_t* PD_TensorMutableDataUint8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int8_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int8_t* PD_TensorMutableDataInt8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern float* PD_TensorDataFloat( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int64_t* PD_TensorDataInt64( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. 
+/// +PADDLE_CAPI_EXPORT extern int32_t* PD_TensorDataInt32( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern uint8_t* PD_TensorDataUint8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int8_t* PD_TensorDataInt8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuFloat( + __pd_keep PD_Tensor* pd_tensor, const float* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt64( + __pd_keep PD_Tensor* pd_tensor, const int64_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt32( + __pd_keep PD_Tensor* pd_tensor, const int32_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuUint8( + __pd_keep PD_Tensor* pd_tensor, const uint8_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt8( + __pd_keep PD_Tensor* pd_tensor, const int8_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuFloat( + __pd_keep PD_Tensor* pd_tensor, float* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt64( + __pd_keep PD_Tensor* pd_tensor, int64_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. 
+/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt32( + __pd_keep PD_Tensor* pd_tensor, int32_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuUint8( + __pd_keep PD_Tensor* pd_tensor, uint8_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt8( + __pd_keep PD_Tensor* pd_tensor, int8_t* data); +/// +/// \brief Get the tensor shape +/// \param[in] pd_tensor tensor. +/// \return The tensor shape. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayInt32* PD_TensorGetShape( + __pd_keep PD_Tensor* pd_tensor); + +/// +/// \brief Set the tensor lod information +/// \param[in] pd_tensor tensor. +/// \param[in] lod lod information. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorSetLod( + __pd_keep PD_Tensor* pd_tensor, __pd_keep PD_TwoDimArraySize* lod); +/// +/// \brief Get the tensor lod information +/// \param[in] pd_tensor tensor. +/// \return the lod information. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_TwoDimArraySize* PD_TensorGetLod( + __pd_keep PD_Tensor* pd_tensor); +/// +/// \brief Get the tensor name +/// \param[in] pd_tensor tensor. +/// \return the tensor name. +/// +PADDLE_CAPI_EXPORT extern const char* PD_TensorGetName( + __pd_keep PD_Tensor* pd_tensor); +/// +/// \brief Get the tensor data type +/// \param[in] pd_tensor tensor. +/// \return the tensor data type. +/// +PADDLE_CAPI_EXPORT extern PD_DataType PD_TensorGetDataType( + __pd_keep PD_Tensor* pd_tensor); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/pd_types.h b/paddle/fluid/inference/capi_exp/pd_types.h new file mode 100644 index 0000000000000..a5da2913a9b20 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_types.h @@ -0,0 +1,40 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
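With pd_predictor.h, pd_tensor.h and the destroy helpers of pd_utils.h declared above, a typical inference call chain looks roughly like the sketch below. It is illustrative only: build_config() is the hypothetical helper from the earlier config sketch, and the shape and buffer sizes are placeholders for a model with one float input and one float output.

/* Illustrative end-to-end sketch; not part of the patch. build_config() is the
 * hypothetical helper from the config sketch above; shapes are placeholders. */
#include <stdlib.h>
#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

void run_once(void) {
  /* PD_PredictorCreate takes ownership of the config (__pd_take). */
  PD_Predictor* predictor = PD_PredictorCreate(build_config());

  PD_OneDimArrayCstr* in_names = PD_PredictorGetInputNames(predictor);
  PD_Tensor* input = PD_PredictorGetInputHandle(predictor, in_names->data[0]);

  int32_t shape[4] = {1, 3, 224, 224};
  float* data = (float*)calloc(1 * 3 * 224 * 224, sizeof(float));
  PD_TensorReshape(input, 4, shape);       /* must precede CopyFromCpu */
  PD_TensorCopyFromCpuFloat(input, data);

  PD_PredictorRun(predictor);

  PD_OneDimArrayCstr* out_names = PD_PredictorGetOutputNames(predictor);
  PD_Tensor* output = PD_PredictorGetOutputHandle(predictor, out_names->data[0]);
  PD_OneDimArrayInt32* out_shape = PD_TensorGetShape(output);
  int32_t numel = 1;
  for (size_t i = 0; i < out_shape->size; ++i) numel *= out_shape->data[i];
  float* result = (float*)malloc(numel * sizeof(float));
  PD_TensorCopyToCpuFloat(output, result);

  /* Every __pd_give object must be released with its matching destroy call. */
  PD_OneDimArrayInt32Destroy(out_shape);
  PD_OneDimArrayCstrDestroy(out_names);
  PD_OneDimArrayCstrDestroy(in_names);
  PD_TensorDestroy(output);
  PD_TensorDestroy(input);
  PD_PredictorDestroy(predictor);
  free(result);
  free(data);
}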
+ +#pragma once + +#include +#include + +#include "pd_common.h" // NOLINT + +typedef struct PD_OneDimArrayInt32 { + size_t size; + int32_t* data; +} PD_OneDimArrayInt32; // std::vector + +typedef struct PD_OneDimArraySize { + size_t size; + size_t* data; +} PD_OneDimArraySize; // std::vector + +typedef struct PD_OneDimArrayCstr { + size_t size; + char** data; +} PD_OneDimArrayCstr; // std::vector + +typedef struct PD_TwoDimArraySize { + size_t size; + PD_OneDimArraySize** data; +} PD_TwoDimArraySize; // std::vector> diff --git a/paddle/fluid/inference/capi_exp/pd_utils.cc b/paddle/fluid/inference/capi_exp/pd_utils.cc new file mode 100644 index 0000000000000..2e762619f5567 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_utils.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" +#include "paddle/fluid/platform/enforce.h" + +#define DESTROY_ONE_DIM_ARRAY(type) \ + void PD_OneDimArray##type##Destroy(__pd_take PD_OneDimArray##type* array) { \ + if (array != NULL) { \ + delete[] array->data; \ + delete array; \ + } \ + } +#define CONVERT_VEC_TO_ONE_DIM_ARRAY(type, Type, vec_type) \ + __pd_give PD_OneDimArray##Type* CvtVecToOneDimArray##Type( \ + const std::vector& vec) { \ + PD_OneDimArray##Type* array = new PD_OneDimArray##Type; \ + array->size = vec.size(); \ + array->data = vec.empty() ? 
NULL : new type[vec.size()]; \ + for (size_t index = 0; index < vec.size(); ++index) { \ + array->data[index] = vec[index]; \ + } \ + return array; \ + } +#define CONVERT_ONE_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + std::vector CvtOneDimArrayToVec##Type( \ + __pd_keep const PD_OneDimArray##Type* array) { \ + std::vector vec; \ + if (array != NULL) { \ + vec.resize(array->size); \ + for (size_t index = 0; index < array->size; ++index) { \ + vec[index] = array->data[index]; \ + } \ + } \ + return vec; \ + } + +#define ONE_DIM_ARRAY_UTILS_FUNC_IMPL(type, Type, vec_type) \ + extern "C" { \ + DESTROY_ONE_DIM_ARRAY(Type); \ + } \ + namespace paddle_infer { \ + CONVERT_VEC_TO_ONE_DIM_ARRAY(type, Type, vec_type) \ + CONVERT_ONE_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + } + +ONE_DIM_ARRAY_UTILS_FUNC_IMPL(int32_t, Int32, int) +ONE_DIM_ARRAY_UTILS_FUNC_IMPL(size_t, Size, size_t) + +#undef ONE_DIM_ARRAY_UTILS_FUNC_IMPL +#undef CONVERT_ONE_DIM_ARRAY_TO_VEC +#undef CONVERT_VEC_TO_ONE_DIM_ARRAY +#undef DESTROY_ONE_DIM_ARRAY + +void PD_OneDimArrayCstrDestroy(__pd_take PD_OneDimArrayCstr* array) { + if (array != NULL) { + if (array->size != 0) { + for (size_t index = 0; index < array->size; ++index) { + delete[] array->data[index]; + } + } + delete[] array->data; + delete array; + } +} +namespace paddle_infer { + +__pd_give PD_OneDimArrayCstr* CvtVecToOneDimArrayCstr( + const std::vector& vec) { + PD_OneDimArrayCstr* array = new PD_OneDimArrayCstr; + array->size = vec.size(); + array->data = vec.empty() ? NULL : new char*[vec.size()]; + for (size_t index = 0u; index < vec.size(); ++index) { + array->data[index] = new char[vec[index].size() + 1]; + memcpy(array->data[index], vec[index].c_str(), vec[index].size() + 1); + } + return array; +} + +std::vector CvtOneDimArrayToVecCstr( + __pd_keep const PD_OneDimArrayCstr* array) { + std::vector vec; + for (size_t index = 0; index < array->size; ++index) { + vec.emplace_back(array->data[index]); + } + return vec; +} + +} // namespace paddle_infer + +#define DESTROY_TWO_DIM_ARRAY(type) \ + void PD_TwoDimArray##type##Destroy(__pd_take PD_TwoDimArray##type* array) { \ + if (array != NULL) { \ + if (array->size != 0) { \ + for (size_t index = 0; index < array->size; ++index) { \ + PD_OneDimArray##type##Destroy(array->data[index]); \ + } \ + } \ + delete[] array->data; \ + delete array; \ + } \ + } +#define CONVERT_VEC_TO_TWO_DIM_ARRAY(type, Type, vec_type) \ + __pd_give PD_TwoDimArray##Type* CvtVecToTwoDimArray##Type( \ + const std::vector>& vec) { \ + PD_TwoDimArray##Type* array = new PD_TwoDimArray##Type; \ + array->size = vec.size(); \ + array->data = vec.empty() ? 
NULL : new PD_OneDimArray##Type*[vec.size()]; \ + for (size_t index = 0; index < vec.size(); ++index) { \ + array->data[index] = CvtVecToOneDimArray##Type(vec[index]); \ + } \ + return array; \ + } +#define CONVERT_TWO_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + std::vector> CvtTwoDimArrayToVec##Type( \ + __pd_keep const PD_TwoDimArray##Type* array) { \ + std::vector> vec; \ + if (array != NULL && array->size != 0) { \ + vec.resize(array->size); \ + for (size_t index = 0; index < array->size; ++index) { \ + vec[index] = CvtOneDimArrayToVec##Type((array->data)[index]); \ + } \ + } \ + return vec; \ + } +#define TWO_DIM_ARRAY_UTILS_FUNC_IMPL(type, Type, vec_type) \ + extern "C" { \ + DESTROY_TWO_DIM_ARRAY(Type); \ + } \ + namespace paddle_infer { \ + CONVERT_VEC_TO_TWO_DIM_ARRAY(type, Type, vec_type) \ + CONVERT_TWO_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + } + +TWO_DIM_ARRAY_UTILS_FUNC_IMPL(size_t, Size, size_t) + +#undef TWO_DIM_ARRAY_UTILS_FUNC_IMPL +#undef CONVERT_TWO_DIM_ARRAY_TO_VEC +#undef CONVERT_VEC_TO_TWO_DIM_ARRAY +#undef DESTROY_TWO_DIM_ARRAY + +namespace paddle_infer { + +PlaceType CvtToCxxPlaceType(PD_PlaceType place_type) { + switch (place_type) { + case PD_PLACE_UNK: + return PlaceType::kUNK; + case PD_PLACE_CPU: + return PlaceType::kCPU; + case PD_PLACE_GPU: + return PlaceType::kGPU; + case PD_PLACE_XPU: + return PlaceType::kXPU; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupport paddle place type %d.", place_type)); + return PlaceType::kUNK; + } +} + +PD_PlaceType CvtFromCxxPlaceType(PlaceType place_type) { + switch (place_type) { + case PlaceType::kCPU: + return PD_PLACE_CPU; + case PlaceType::kGPU: + return PD_PLACE_GPU; + case PlaceType::kXPU: + return PD_PLACE_XPU; + default: + return PD_PLACE_UNK; + } +} + +DataType CvtToCxxDatatype(PD_DataType data_type) { + switch (data_type) { + case PD_DATA_FLOAT32: + return DataType::FLOAT32; + case PD_DATA_INT64: + return DataType::INT64; + case PD_DATA_INT32: + return DataType::INT32; + case PD_DATA_UINT8: + return DataType::UINT8; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupport paddle data type %d.", data_type)); + return DataType::FLOAT32; + } +} + +PD_DataType CvtFromCxxDatatype(DataType data_type) { + switch (data_type) { + case DataType::FLOAT32: + return PD_DATA_FLOAT32; + case DataType::INT64: + return PD_DATA_INT64; + case DataType::INT32: + return PD_DATA_INT32; + case DataType::UINT8: + return PD_DATA_UINT8; + default: + return PD_DATA_UNK; + } +} + +} // namespace paddle_infer diff --git a/paddle/fluid/inference/capi_exp/pd_utils.h b/paddle/fluid/inference/capi_exp/pd_utils.h new file mode 100644 index 0000000000000..68e519d4bb5e9 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_utils.h @@ -0,0 +1,70 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_utils.h +/// +/// \brief Some utility function to destroy paddle struct. 
+/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include +#include + +#include "pd_types.h" // NOLINT + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Destroy the PD_OneDimArrayInt32 object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArrayInt32 object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArrayInt32Destroy( + __pd_take PD_OneDimArrayInt32* array); + +/// +/// \brief Destroy the PD_OneDimArrayCstr object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArrayCstr object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArrayCstrDestroy( + __pd_take PD_OneDimArrayCstr* array); + +/// +/// \brief Destroy the PD_OneDimArraySize object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArraySize object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArraySizeDestroy( + __pd_take PD_OneDimArraySize* array); + +/// +/// \brief Destroy the PD_TwoDimArraySize object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_TwoDimArraySize object. +/// +PADDLE_CAPI_EXPORT extern void PD_TwoDimArraySizeDestroy( + __pd_take PD_TwoDimArraySize* array); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/types_internal.h b/paddle/fluid/inference/capi_exp/types_internal.h new file mode 100644 index 0000000000000..8a61b9a884c3b --- /dev/null +++ b/paddle/fluid/inference/capi_exp/types_internal.h @@ -0,0 +1,29 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_common.h" + +typedef struct PD_Tensor { + std::unique_ptr tensor; +} PD_Tensor; + +typedef struct PD_Predictor { + std::shared_ptr predictor; +} PD_Predictor; diff --git a/paddle/fluid/inference/capi_exp/utils_internal.h b/paddle/fluid/inference/capi_exp/utils_internal.h new file mode 100644 index 0000000000000..fbae512ecd855 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/utils_internal.h @@ -0,0 +1,153 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file utils_internal.h +/// +/// \brief Some utility function used to convert object between C Struct and C++ +/// Class. 
+/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" + +namespace paddle_infer { + +/// +/// \brief Convert the 'std::vector' object to a 'PD_OneDimArrayInt32' +/// object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArrayInt32* CvtVecToOneDimArrayInt32( + const std::vector& vec); + +/// +/// \brief Convert the 'PD_OneDimArrayInt32' object to a 'std::vector' +/// object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector CvtOneDimArrayToVecInt32( + __pd_keep const PD_OneDimArrayInt32* array); + +/// +/// \brief Convert the 'std::vector' object to a 'PD_OneDimArraySize' +/// object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArraySize* CvtVecToOneDimArraySize( + const std::vector& vec); + +/// +/// \brief Convert the 'PD_OneDimArraySize' object to a 'std::vector' +/// object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector CvtOneDimArrayToVecSize( + __pd_keep const PD_OneDimArraySize* array); + +/// +/// \brief Convert the 'std::vector' object to a +/// 'PD_OneDimArrayCstr' object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArrayCstr* CvtVecToOneDimArrayCstr( + const std::vector& vec); + +/// +/// \brief Convert the 'PD_OneDimArrayCstr' object to a +/// 'std::vector' object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector CvtOneDimArrayToVecCstr( + __pd_keep const PD_OneDimArrayCstr* array); + +/// +/// \brief Convert the 'std::vector>' object to a +/// 'PD_TwoDimArraySize' object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_TwoDimArraySize* CvtVecToTwoDimArraySize( + const std::vector>& vec); + +/// +/// \brief Convert the 'PD_TwoDimArraySize' object to a +/// 'std::vector>' object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector> CvtTwoDimArrayToVecSize( + __pd_keep const PD_TwoDimArraySize* array); + +/// +/// \brief Convert the 'PD_PlaceType' object to a 'paddle_infer::PlaceType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +PlaceType CvtToCxxPlaceType(PD_PlaceType place_type); + +/// +/// \brief Convert the 'paddle_infer::PlaceType' object to a 'PD_PlaceType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +PD_PlaceType CvtFromCxxPlaceType(PlaceType place_type); + +/// +/// \brief Convert the 'PD_DataType' object to a 'paddle_infer::DataType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +DataType CvtToCxxDatatype(PD_DataType data_type); + +/// +/// \brief Convert the 'paddle_infer::DataType' object to a 'PD_DataType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. 
+/// +PD_DataType CvtFromCxxDatatype(DataType data_type); + +} // namespace paddle_infer diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index 59a786e46c98b..908e1ab990bb7 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -59,8 +59,14 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create( #endif #ifdef LITE_SUBGRAPH_WITH_XPU + // Deprecated in Paddle-Lite release/v2.8 lite_cxx_config.set_xpu_workspace_l3_size_per_thread( cfg.xpu_l3_workspace_size); + lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_workspace_size, + cfg.locked); + lite_cxx_config.set_xpu_conv_autotune(cfg.autotune, cfg.autotune_file); + lite_cxx_config.set_xpu_multi_encoder_method(cfg.precision, + cfg.adaptive_seqlen); #endif // create predictor diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h index 5ba487cc24d7d..a64ef1eda828b 100644 --- a/paddle/fluid/inference/lite/engine.h +++ b/paddle/fluid/inference/lite/engine.h @@ -42,6 +42,11 @@ struct EngineConfig { // for xpu size_t xpu_l3_workspace_size; + bool locked = false; + bool autotune = true; + std::string autotune_file = ""; + std::string precision = "int16"; + bool adaptive_seqlen = false; // for x86 or arm int cpu_math_library_num_threads{1}; diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index a6484a1355705..7ea41839cb939 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -38,38 +38,6 @@ class BatchNormOpConverter : public OpConverter { VLOG(3) << "convert a fluid batch norm op to tensorrt batch_norm"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "Invalid input X's size of batch_norm TRT converter. " - "Expected 1, received %d.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Bias's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Bias").size())); // Bias is a weight - PADDLE_ENFORCE_EQ(op_desc.Input("Mean").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Mean's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Mean").size())); // Mean is a weight - PADDLE_ENFORCE_EQ(op_desc.Input("Scale").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Scale's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Scale").size())); // Scale is a weight - PADDLE_ENFORCE_EQ( - op_desc.Input("Variance").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Variance's size of batch_norm TRT converter. " - "Expected 1, received %d.", - op_desc.Input("Variance").size())); // Variance is a weight - PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1, - platform::errors::InvalidArgument( - "Invalid output Y's size of batch_norm TRT " - "converter. 
Expected 1, received %d.", - op_desc.Output("Y").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); // Declare weights auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front()); diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 5515cd35daedc..ba47358b147db 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -36,18 +36,6 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, VLOG(3) << "convert a fluid " << name << " op to tensorrt layer without bias"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 input, but got %d input.", - op_desc.Input("Input").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 filter, but got %d filter.", - op_desc.Input("Filter").size())); - PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 output, but got %d output.", - op_desc.Output("Output").size())); auto* X = engine->GetITensor(op_desc.Input("Input").front()); std::string filter_var_name = op_desc.Input("Filter").front(); @@ -61,13 +49,6 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, if (enable_int8) { #if IS_TRT_VERSION_GE(5000) - if (op_desc.Type() != "conv2d_transpose") { - PADDLE_ENFORCE_EQ( - op_desc.HasAttr("Input_scale"), true, - platform::errors::InvalidArgument("Input scale not found. TRT int8" - " requires conv/deconv to have " - "input quantization scales.")); - } float in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127; auto weight_scale = @@ -184,14 +165,6 @@ class Deconv2dOpConverter : public OpConverter { return layer; }, [](nvinfer1::IDeconvolutionLayer* layer, nvinfer1::DimsHW& dilations) { - // In trt Deconv, dilation should be 1, ohter values are not - // supported. 
- bool condition = (dilations.d[0] == 1 && dilations.d[1] == 1); - PADDLE_ENFORCE_EQ(condition, true, - platform::errors::InvalidArgument( - "In Deconv, Dilations must be (1, 1) for " - "tensorRT, but given (%d, %d)", - dilations.d[0], dilations.d[1])); }, "conv2d_transpose"); } diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 74057addecd1f..5419933e40736 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -43,25 +43,6 @@ class ElementwiseWeightOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer"; - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"X\").size() " - "should equal to 1, but received Input(\"X\").size() = %u.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ( - op_desc.Input("Y").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"Y\").size() " - "should equal to 1, but received Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); // Y is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Out").size(), 1, - platform::errors::InvalidArgument( - "The input op's Output(\"Out\").size() " - "should equal to 1, but reveceid Output(\"Out\").size() = %u.", - op_desc.Output("Out").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); PADDLE_ENFORCE_NOT_NULL( @@ -193,25 +174,6 @@ class ElementwiseTensorOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); nvinfer1::ILayer* layer = nullptr; - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"X\").size() " - "should equal to 1, but received Input(\"X\").size() = %u.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ( - op_desc.Input("Y").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"Y\").size() " - "should equal to 1, but received Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); // Y is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Out").size(), 1, - platform::errors::InvalidArgument( - "The input op's Output(\"Out\").size() " - "should equal to 1, but received Output(\"Out\").size() = %u.", - op_desc.Output("Out").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Y = engine_->GetITensor(op_desc.Input("Y").front()); std::vector itensors; diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index 957dfe0369898..57ac30b5f6bd7 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -31,16 +31,11 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { #if IS_TRT_VERSION_GE(6000) - VLOG(4) << "convert fluid swish op to tensorrt layer"; + VLOG(4) << "convert fluid EmbEltwiseLayerNorm op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); auto id_names = op_desc.Input("Ids"); auto emb_names = op_desc.Input("Embs"); - - PADDLE_ENFORCE_EQ(id_names.size(), emb_names.size(), - platform::errors::InvalidArgument( - "The id and emb size of fused EmbEltwiseLayerNormOp " - 
"should be same ")); int input_num = id_names.size(); // Declare inputs @@ -89,97 +84,92 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { int64_t bias_size = framework::product(bias_dims); int64_t scale_size = framework::product(scale_dims); nvinfer1::ILayer* layer = nullptr; + bool enable_int8 = op_desc.HasAttr("enable_int8"); - if (engine_->with_dynamic_shape()) { - if (engine_->use_oss()) { - int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 1 : 0); - PADDLE_ENFORCE_EQ( - output_fp16, 1, - platform::errors::InvalidArgument( - "Only Precision::KHalf(fp16) is supported when infering " - "ernie(bert) model with config.EnableTensorRtOSS(). " - "But Precision::KFloat32 is setted.")); - const std::vector fields{ - {"bert_embeddings_layernorm_beta", bias, - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(bias_size)}, - {"bert_embeddings_layernorm_gamma", scale, - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(scale_size)}, - {"bert_embeddings_word_embeddings", input_embs[0], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[0])}, - {"bert_embeddings_token_type_embeddings", input_embs[2], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[2])}, - {"bert_embeddings_position_embeddings", input_embs[1], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[1])}, - {"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1}, - }; - - // remember to free - nvinfer1::PluginFieldCollection* plugin_ptr = - static_cast( - malloc(sizeof(*plugin_ptr) + - fields.size() * sizeof(nvinfer1::PluginField))); - plugin_ptr->nbFields = static_cast(fields.size()); - plugin_ptr->fields = fields.data(); - - std::vector plugin_inputs; - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(0)->getName())); // word_embedding, - // eval_placeholder_0 - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(1)->getName())); // sent_embedding, - // eval_placeholder_1 - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(2)->getName())); // cu_seqlens, - // eval_placeholder_2 - auto max_seqlen_tensor = - engine_->GetITensor(engine_->network()->getInput(3)->getName()); - auto* shuffle_layer = TRT_ENGINE_ADD_LAYER( - engine_, Shuffle, - *const_cast(max_seqlen_tensor)); - nvinfer1::Dims shape_dim; - shape_dim.nbDims = 1; - shape_dim.d[0] = -1; - shuffle_layer->setReshapeDimensions(shape_dim); - plugin_inputs.emplace_back( - shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 - - auto creator = GetPluginRegistry()->getPluginCreator( - "CustomEmbLayerNormPluginDynamic", "2"); - - auto plugin_obj = creator->createPlugin( - "CustomEmbLayerNormPluginDynamic", plugin_ptr); - auto plugin_layer = engine_->network()->addPluginV2( - plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); - layer = plugin_layer; - free(plugin_ptr); - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", - {output_name, std::string("qkv_plugin_mask")}, - test_mode); - } else { - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); - plugin::DynamicPluginTensorRT* plugin = nullptr; - plugin = new plugin::EmbEltwiseLayernormPluginDynamic( - input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, - eps, with_fp16); - layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin); - auto output_name = op_desc.Output("Out")[0]; - 
RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, - test_mode); + if (engine_->use_oss()) { + int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 1 : 0); + if (enable_int8) { + output_fp16 = 1; } + PADDLE_ENFORCE_EQ( + output_fp16, 1, + platform::errors::InvalidArgument( + "Only Precision::KHalf(fp16) is supported when infering " + "ernie(bert) model with config.EnableTensorRtOSS(). " + "But Precision::KFloat32 is setted.")); + const std::vector fields{ + {"bert_embeddings_layernorm_beta", bias, + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(bias_size)}, + {"bert_embeddings_layernorm_gamma", scale, + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(scale_size)}, + {"bert_embeddings_word_embeddings", input_embs[0], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[0])}, + {"bert_embeddings_token_type_embeddings", input_embs[2], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[2])}, + {"bert_embeddings_position_embeddings", input_embs[1], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[1])}, + {"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1}, + }; + + // remember to free + nvinfer1::PluginFieldCollection* plugin_ptr = + static_cast( + malloc(sizeof(*plugin_ptr) + + fields.size() * sizeof(nvinfer1::PluginField))); + plugin_ptr->nbFields = static_cast(fields.size()); + plugin_ptr->fields = fields.data(); + + std::vector plugin_inputs; + plugin_inputs.emplace_back(engine_->GetITensor( + engine_->network()->getInput(0)->getName())); // word_embedding, + // eval_placeholder_0 + plugin_inputs.emplace_back(engine_->GetITensor( + engine_->network()->getInput(1)->getName())); // sent_embedding, + // eval_placeholder_1 + plugin_inputs.emplace_back(engine_->GetITensor( + engine_->network()->getInput(2)->getName())); // cu_seqlens, + // eval_placeholder_2 + auto max_seqlen_tensor = + engine_->GetITensor(engine_->network()->getInput(3)->getName()); + auto* shuffle_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *max_seqlen_tensor); + nvinfer1::Dims shape_dim; + shape_dim.nbDims = 1; + shape_dim.d[0] = -1; + shuffle_layer->setReshapeDimensions(shape_dim); + plugin_inputs.emplace_back( + shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 + + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomEmbLayerNormPluginDynamic", "2"); + + auto plugin_obj = + creator->createPlugin("CustomEmbLayerNormPluginDynamic", plugin_ptr); + auto plugin_layer = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); + layer = plugin_layer; + free(plugin_ptr); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", + {output_name, std::string("qkv_plugin_mask")}, + test_mode); } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) 
interface" - " to set the shape information to run the dynamic shape mode.")); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); + plugin::DynamicPluginTensorRT* plugin = nullptr; + plugin = new plugin::EmbEltwiseLayernormPluginDynamic( + input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, + eps, with_fp16); + layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, + test_mode); } #else diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 527d0ee208578..aebdb8f884c2c 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -106,8 +106,22 @@ class FcOpConverter : public OpConverter { auto regist_fc = [&](nvinfer1::ITensor* inputs, int n_output, TensorRTEngine::Weight& weight, TensorRTEngine::Weight& bias) { - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, - n_output, weight.get(), bias.get()); + nvinfer1::ILayer* fc_layer = nullptr; + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in fc layers in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + nvinfer1::DimsHW nv_ksize(1, 1); + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output, + nv_ksize, weight.get(), bias.get()); + engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + } else { + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, + n_output, weight.get(), bias.get()); + } auto output_name = op_desc.Output("Out").front(); if (activation_type == "relu") { @@ -146,66 +160,61 @@ class FcOpConverter : public OpConverter { if (engine_->with_dynamic_shape()) { // not NCHW layout, but NLP layout with added 'x 1 x 1' auto x_dim = X->getDimensions(); - if (x_dim.nbDims == 3 || x_dim.nbDims == 2) { - auto output_name = op_desc.Output("Out").front(); - // add shuffle before fc - nvinfer1::Dims reshape_before_fc_dim; - reshape_before_fc_dim.nbDims = x_dim.nbDims + 2; - for (int i = 0; i < x_dim.nbDims; i++) { - reshape_before_fc_dim.d[i] = 0; - } - reshape_before_fc_dim.d[x_dim.nbDims] = 1; - reshape_before_fc_dim.d[x_dim.nbDims + 1] = 1; - auto* reshape_before_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); - reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); - reshape_before_fc_layer->setName( - ("shuffle_before_fc(Output: " + output_name + ")").c_str()); + PADDLE_ENFORCE_LE( + x_dim.nbDims - x_num_col_dims, 3, + platform::errors::InvalidArgument( + "Params and input dims mismatch. 
Paddle-TRT FC " + "converter expects x_dim.nbDims - x_num_col_dims <= 3, but " + "x_dim.nbDims = %d, x_num_col_dims = %d.", + x_dim.nbDims, x_num_col_dims)); + auto output_name = op_desc.Output("Out").front(); + // add shuffle before fc + nvinfer1::Dims reshape_before_fc_dim; + // padding shape "x 1 x 1" + int padding_length = 3 - (x_dim.nbDims - x_num_col_dims); + reshape_before_fc_dim.nbDims = x_dim.nbDims + padding_length; + int cur_dim_index = reshape_before_fc_dim.nbDims - 1; + while (padding_length-- > 0) { + reshape_before_fc_dim.d[cur_dim_index--] = 1; + } + while (cur_dim_index >= 0) { + reshape_before_fc_dim.d[cur_dim_index--] = 0; + } - // add fc layer - auto* fc_layer = TRT_ENGINE_ADD_LAYER( - engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), - n_output, weight.get(), bias.get()); - fc_layer->setName(("fc_layer(Output: " + output_name + ")").c_str()); + auto* reshape_before_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); + reshape_before_fc_layer->setName( + ("shuffle_before_fc(Output: " + output_name + ")").c_str()); - // add shuffle after fc - nvinfer1::Dims reshape_after_fc_dim; - if (x_dim.nbDims == 3) { - if (x_num_col_dims == 2) { - reshape_after_fc_dim.nbDims = 3; - reshape_after_fc_dim.d[0] = 0; - reshape_after_fc_dim.d[1] = 0; - reshape_after_fc_dim.d[2] = 0; - } else { - reshape_after_fc_dim.nbDims = 2; - reshape_after_fc_dim.d[0] = 0; - auto dim = fc_layer->getOutput(0)->getDimensions(); - reshape_after_fc_dim.d[1] = dim.d[1] * dim.d[2]; - } - // x_dim.nbDims == 2 - } else { - reshape_after_fc_dim.nbDims = 2; - reshape_after_fc_dim.d[0] = 0; - reshape_after_fc_dim.d[1] = 0; - } - auto* reshape_after_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0)); - reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim); + // add fc layer + auto* fc_layer = TRT_ENGINE_ADD_LAYER( + engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), + n_output, weight.get(), bias.get()); + fc_layer->setName(("fc_layer(Output: " + output_name + ")").c_str()); - if (activation_type == "relu") { - reshape_after_fc_layer->setName( - ("shuffle_after_fc(Output: " + output_name + ")").c_str()); - nvinfer1::IActivationLayer* relu_layer = TRT_ENGINE_ADD_LAYER( - engine_, Activation, *(reshape_after_fc_layer->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer, "relu_after_fc_shuffle", - {output_name}, test_mode); - } else { - RreplenishLayerAndOutput(reshape_after_fc_layer, "shuffle_after_fc", - {output_name}, test_mode); - } + // add shuffle after fc + nvinfer1::Dims reshape_after_fc_dim; + reshape_after_fc_dim.nbDims = x_num_col_dims + 1; + for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) { + reshape_after_fc_dim.d[i] = 0; + } + + auto* reshape_after_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0)); + reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim); + + if (activation_type == "relu") { + reshape_after_fc_layer->setName( + ("shuffle_after_fc(Output: " + output_name + ")").c_str()); + nvinfer1::IActivationLayer* relu_layer = TRT_ENGINE_ADD_LAYER( + engine_, Activation, *(reshape_after_fc_layer->getOutput(0)), + nvinfer1::ActivationType::kRELU); + RreplenishLayerAndOutput(relu_layer, "relu_after_fc_shuffle", + {output_name}, test_mode); } else { - regist_fc(X, n_output, weight, bias); + RreplenishLayerAndOutput(reshape_after_fc_layer, "shuffle_after_fc", + {output_name}, 
test_mode); } return; } @@ -229,13 +238,24 @@ class FcOpConverter : public OpConverter { "dims equals to 4, the last dim of input must be 1, but got %d", input_d[3])); } - for (int i = 0; i < 3; i++) { - if (i < input_dims) { - reshape_dim3[i] = input_d[i]; - } else { - reshape_dim3[i] = 1; + if (enable_int8) { + reshape_dim3[0] = 1; + for (int i = 0; i < 3; i++) { + reshape_dim3[0] *= input_d[i]; + if (i > 0) { + reshape_dim3[i] = 1; + } + } + } else { + for (int i = 0; i < 3; i++) { + if (i < input_dims) { + reshape_dim3[i] = input_d[i]; + } else { + reshape_dim3[i] = 1; + } } } + nvinfer1::Dims3 reshape_dim(reshape_dim3[0], reshape_dim3[1], reshape_dim3[2]); auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); @@ -249,11 +269,25 @@ class FcOpConverter : public OpConverter { platform::errors::InvalidArgument( "Invalid dimensions. When x_num_col_dims equals to " "2, input_dims should not be 1")); - for (int i = 0; i < 4; i++) { - if (i < input_dims) { - reshape_dim4[i] = input_d[i]; - } else { - reshape_dim4[i] = 1; + + if (enable_int8) { + for (int i = 0; i < 4; i++) { + if (i == 0) { + reshape_dim4[i] = input_d[i]; + } else { + reshape_dim4[i] = 1; + if (i < input_dims) { + reshape_dim4[1] *= input_d[i]; + } + } + } + } else { + for (int i = 0; i < 4; i++) { + if (i < input_dims) { + reshape_dim4[i] = input_d[i]; + } else { + reshape_dim4[i] = 1; + } } } nvinfer1::Dims4 reshape_dim(reshape_dim4[0], reshape_dim4[1], diff --git a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc index ca5b6a8b52e79..0436499cd4075 100644 --- a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc @@ -47,15 +47,7 @@ class GeluOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs int input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1, - platform::errors::InvalidArgument( - "gelu op has only 1 input, but got %d", input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1, - platform::errors::InvalidArgument( - "gelu op has only 1 output, but got %d", output_num)); nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { diff --git a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc index 9dc40ceec4809..7ef79e547d09a 100644 --- a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc @@ -41,17 +41,7 @@ class HardSwishOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs int input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ( - input_num, 1, - platform::errors::InvalidArgument( - "HardSwish op has only 1 input, but got %d", input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ( - output_num, 1, - platform::errors::InvalidArgument( - "HardSwish op has only 1 output, but got %d", output_num)); const float threshold = op_desc.HasAttr("threshold") diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc index c1f266bacfec5..0b97b5d87a3d5 100644 --- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc +++ 
b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc @@ -25,25 +25,6 @@ class LayerNormOpConverter : public OpConverter { const framework::Scope& scope, bool test_mode) override { VLOG(4) << "convert a fluid layer_norm op to tensorrt layer_norm plugin"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "input of layer_norm op converter should be 1, got %d", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1, - platform::errors::InvalidArgument( - "Bias of layer_norm op converter should be 1, got %d", - op_desc.Input("Bias").size())); // Bias is a weight - PADDLE_ENFORCE_EQ( - op_desc.Input("Scale").size(), 1, - platform::errors::InvalidArgument( - "Scale of layer_norm op converter should be 1, got %d", - op_desc.Input("Scale").size())); // Scale is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Y").size(), 1, - platform::errors::InvalidArgument( - "output of layer_norm op converter should be 1, got %d", - op_desc.Input("Y").size())); auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front()); diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc index c2ffb3f3197c1..d6277b5208d5a 100644 --- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -36,21 +36,7 @@ class LeakyReluOpConverter : public OpConverter { VLOG(4) << "convert fluid leaky_relu op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); - // Declare inputs - size_t input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid number of TRT leaky_relu op converter " - "inputs. Expected 1, but received %d", - input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1UL, - platform::errors::InvalidArgument( - "Invalid number of TRT leaky_relu op converter " - "outputs. 
Expected 1, but received %d", - output_num)); // Get attrs float alpha = BOOST_GET_CONST(float, op_desc.GetAttr("alpha")); nvinfer1::ILayer* output_layer = nullptr; diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index 2008646549132..f2f45c694ab44 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -40,8 +40,25 @@ class MultiheadMatMulOpConverter : public OpConverter { auto* bias_v = scope.FindVar(bias_name); auto* bias_t = bias_v->GetMutable(); - float* weight_data = - engine_->GetWeightCPUData(weight_name, weight_t, false); + float* weight_data = nullptr; + bool enable_int8 = op_desc.HasAttr("enable_int8"); + float in_scale = 0.; + + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("Input_scale"), true, + platform::errors::InvalidArgument( + "must have input scale in multihead layers in int8 mode")); + in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127; + auto weight_scale = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("weight_scale")); + weight_data = + engine_->GetWeightCPUData(weight_name, weight_t, true, weight_scale); + engine_->SetTensorDynamicRange(input, in_scale); + } else { + weight_data = engine_->GetWeightCPUData(weight_name, weight_t, false); + } + float* bias_data = engine_->GetWeightCPUData(bias_name, bias_t, false); std::vector weight_data_tmp; weight_data_tmp.reserve(weight_t->numel()); @@ -117,8 +134,27 @@ class MultiheadMatMulOpConverter : public OpConverter { static_cast(bias_data), static_cast(bias_t->numel())}; - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, - n, weight, bias); + nvinfer1::ILayer* fc_layer = nullptr; + float dp_probs = 1.0 / 127.0; + if (enable_int8) { + nvinfer1::DimsHW nv_ksize(1, 1); + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *input, n, + nv_ksize, weight, bias); + } else { + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, n, + weight, bias); + } + + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in multihead layers in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + dp_probs = out_scale / 127.0; + } auto mask_tensor = engine_->GetITensor("qkv_plugin_mask"); @@ -128,6 +164,9 @@ class MultiheadMatMulOpConverter : public OpConverter { int type = static_cast((engine_->WithFp16() == 1) ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT); + if (enable_int8) { + type = static_cast(nvinfer1::DataType::kHALF); + } bool has_mask = true; int var_seqlen = 1; const std::vector fields{ @@ -136,7 +175,7 @@ class MultiheadMatMulOpConverter : public OpConverter { {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1}, {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1}, {"var_seqlen", &var_seqlen, nvinfer1::PluginFieldType::kINT32, 1}, - }; + { "dq_probs", &dp_probs, nvinfer1::PluginFieldType::kFLOAT32, 1 }}; nvinfer1::PluginFieldCollection* plugin_collection = static_cast( malloc(sizeof(*plugin_collection) + diff --git a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc index e91a2ee13f4c2..3940cc5dce1b0 100644 --- a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc @@ -65,13 +65,6 @@ class NearestInterpolateOpConverter : public OpConverter { scale_w = scale; } else { // axis are different in static/dynamic mode - PADDLE_ENFORCE_GT( - out_h, 0, platform::errors::InvalidArgument( - "out_h must be greater than 0 if scale is not set.")); - PADDLE_ENFORCE_GT( - out_w, 0, platform::errors::InvalidArgument( - "out_w must be greater than 0 if scale is not set.")); - bool with_dynamic = engine_->with_dynamic_shape(); int h_axis = (data_layout == framework::DataLayout::kNCHW) + with_dynamic; diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc index 6bf50e4742dd2..d6711bbbd2cb5 100644 --- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -43,8 +43,6 @@ class PadOpConverter : public OpConverter { const std::vector paddings = BOOST_GET_CONST(std::vector, op_desc.GetAttr("paddings")); - const float pad_value = - BOOST_GET_CONST(float, op_desc.GetAttr("pad_value")); nvinfer1::Dims input_shape = input->getDimensions(); int nbDims = input_shape.nbDims; @@ -62,9 +60,6 @@ class PadOpConverter : public OpConverter { "(nbDims + 1) * 2 == pad_size. 
But " "received nbDims:%d, pad_size:%d.", nbDims, pad_size)); - PADDLE_ENFORCE_EQ(pad_value, 0.0, - platform::errors::InvalidArgument( - "The pad layer of TRT only support zero.")); nvinfer1::DimsHW pre_pad(paddings[pad_size - 4], paddings[pad_size - 2]); nvinfer1::DimsHW post_pad(paddings[pad_size - 3], paddings[pad_size - 1]); diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index c10072602d7c5..90d6392fd6404 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -66,15 +66,6 @@ class Pool2dOpConverter : public OpConverter { VLOG(4) << "convert a fluid pool2d op to tensorrt pool2d layer without bias"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Pool2d expect 1 input, but got %d input.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Pool2d expect 1 Output, but got %d output.", - op_desc.Output("Out").size())); - auto *input1 = engine_->GetITensor(op_desc.Input("X")[0]); nvinfer1::Dims input_shape = input1->getDimensions(); int input_dims = input_shape.nbDims; @@ -110,10 +101,6 @@ class Pool2dOpConverter : public OpConverter { nv_pool_type = nvinfer1::PoolingType::kAVERAGE; reduce_operation = nvinfer1::ReduceOperation::kAVG; plugin_pool_type = plugin::PoolPlugin::PoolType::avg; - } else { - PADDLE_THROW(platform::errors::Fatal( - "Wrong pool op type, the trt do not support the %s pool type.", - pool_type)); } nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 74d77d8be4493..a8a36e1238168 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -31,19 +31,7 @@ class PReluOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs size_t input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid input X's size of prelu TRT converter. " - "Expected 1, received %d.", - input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1UL, - platform::errors::InvalidArgument( - "Invalid output Out's size of prelu TRT converter. 
" - "Expected 1, received %d.", - output_num)); // Get attrs std::string mode = BOOST_GET_CONST(std::string, op_desc.GetAttr("mode")); // diff --git a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc index 1329608aecd20..654fe7e013379 100644 --- a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc @@ -62,12 +62,6 @@ class RoiAlignOpConverter : public OpConverter { std::vector inputs{input_tensor, rois_tensor}; nvinfer1::ILayer* layer = nullptr; - PADDLE_ENFORCE_EQ( - engine_->with_dynamic_shape(), true, - platform::errors::InvalidArgument( - "TRT roi align plugin only accept the dynamic shape, because that " - "the roi_align will change the batch size.")); - auto* roi_align_plugin = new plugin::RoiAlignPluginDynamic( data_type_, pooled_height, pooled_width, spatial_scale, sampling_ratio); auto roi_align_layer = engine_->network()->addPluginV2( diff --git a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc index bf1f82076a66c..0fdc262f7e740 100644 --- a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc @@ -50,12 +50,6 @@ class ShuffleChannelOpConverter : public OpConverter { int w = input_dims.d[2]; int group = BOOST_GET_CONST(int, op_desc.GetAttr("group")); - if (engine_->with_dynamic_shape()) { - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, " - "the shuffle_channel op does not support dynamic shape yet")); - } - auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); nvinfer1::Dims4 reshape_dim(group, c / group, h, w); layer->setReshapeDimensions(reshape_dim); diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 3db7709acc22d..e621ac0514109 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -49,55 +49,60 @@ class SkipLayerNormOpConverter : public OpConverter { auto* scale = get_persistable_data("Scale", &scale_dims); int bias_size = framework::product(bias_dims); int scale_size = framework::product(scale_dims); + bool enable_int8 = op_desc.HasAttr("enable_int8"); nvinfer1::ILayer* layer = nullptr; - if (engine_->with_dynamic_shape()) { - if (engine_->use_oss()) { - auto creator = GetPluginRegistry()->getPluginCreator( - "CustomSkipLayerNormPluginDynamic", "2"); - assert(creator != nullptr); - int type = static_cast((engine_->WithFp16() == 1) - ? 
nvinfer1::DataType::kHALF - : nvinfer1::DataType::kFLOAT); - int ld = input1->getDimensions().d[2]; // hidden dimension - assert(ld > 0); - - const std::vector fields{ - {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, - {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, - {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, - {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}, - }; - nvinfer1::PluginFieldCollection* pluginPtr = - static_cast( - malloc(sizeof(*pluginPtr) + - fields.size() * - sizeof(nvinfer1::PluginField))); // remember to free - pluginPtr->nbFields = static_cast(fields.size()); - pluginPtr->fields = fields.data(); - - auto pluginObj = creator->createPlugin( - "CustomSkipLayerNormPluginDynamic", pluginPtr); - auto plugin_layer = engine_->network()->addPluginV2( - inputs.data(), inputs.size(), *pluginObj); - - assert(plugin_layer != nullptr); - layer = plugin_layer; - } else { - float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::SkipLayerNormPluginDynamic* plugin = - new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, - scale_size, eps, with_fp16); - layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin); + + if (engine_->use_oss()) { + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomSkipLayerNormPluginDynamic", "2"); + PADDLE_ENFORCE_NE( + creator, nullptr, + platform::errors::InvalidArgument( + "fail to get creator of CustomSkipLayerNormPluginDynamic")); + int type = static_cast((engine_->WithFp16() == 1) + ? nvinfer1::DataType::kHALF + : nvinfer1::DataType::kFLOAT); + int ld = input1->getDimensions().d[2]; // hidden dimension + PADDLE_ENFORCE_GT(ld, 0, platform::errors::InvalidArgument( + "in CustomSkipLayerNormPluginDynamic hidden " + "dimension should > 0")); + if (enable_int8) { + type = static_cast(nvinfer1::DataType::kHALF); } + + const std::vector fields{ + {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, + {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, + {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, + {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}, + }; + nvinfer1::PluginFieldCollection* pluginPtr = + static_cast( + malloc(sizeof(*pluginPtr) + + fields.size() * + sizeof(nvinfer1::PluginField))); // remember to free + pluginPtr->nbFields = static_cast(fields.size()); + pluginPtr->fields = fields.data(); + + auto pluginObj = + creator->createPlugin("CustomSkipLayerNormPluginDynamic", pluginPtr); + auto plugin_layer = engine_->network()->addPluginV2( + inputs.data(), inputs.size(), *pluginObj); + + PADDLE_ENFORCE_NE( + plugin_layer, nullptr, + platform::errors::InvalidArgument( + "fail to add CustomSkipLayerNormPluginDynamic layer")); + layer = plugin_layer; } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) 
interface" - " to set the shape information to run the dynamic shape mode.")); + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::SkipLayerNormPluginDynamic* plugin = + new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, + scale_size, eps, with_fp16); + layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin); } auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index 38521d256419d..2ab024dff327f 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -31,6 +31,12 @@ class SliceOpConverter : public OpConverter { // Declare inputs auto* input = engine_->GetITensor(op_desc.Input("Input")[0]); + if (op_desc.HasAttr("out_threshold")) { + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + engine_->SetTensorDynamicRange(input, out_scale); + } + std::vector axes = BOOST_GET_CONST(std::vector, op_desc.GetAttr("axes")); std::vector starts = @@ -38,15 +44,6 @@ class SliceOpConverter : public OpConverter { std::vector ends = BOOST_GET_CONST(std::vector, op_desc.GetAttr("ends")); - PADDLE_ENFORCE_EQ( - starts.size(), axes.size(), - platform::errors::InvalidArgument( - "The size of starts must be equal to the size of axes.")); - PADDLE_ENFORCE_EQ( - ends.size(), axes.size(), - platform::errors::InvalidArgument( - "The size of ends must be equal to the size of axes.")); - auto input_dims = input->getDimensions(); if (!engine_->with_dynamic_shape()) { // notice that input shape is [CHW] without batch axis when input has @@ -56,10 +53,6 @@ class SliceOpConverter : public OpConverter { } input_dims.d[0] = 1; // fake batchsize, not useful here for (size_t i = 0; i < axes.size(); i++) { - // split on batch is not supported in TensorRT - PADDLE_ENFORCE_NE(axes[i], 0, platform::errors::InvalidArgument( - "Invalid slice axis. Slice on batch " - "axis is not supported in TensorRT")); if (starts[i] < 0) { starts[i] = std::max(starts[i] + input_dims.d[axes[i]], 0); } diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 75b317e7bfd90..47a6dd783a70c 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -33,17 +33,7 @@ class SplitOpConverter : public OpConverter { size_t output_num = op_desc.Output("Out").size(); // Get Attrs - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid input X's size of split TRT converter. " - "Expected 1, received %d.", - input_num)); int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); - // split on batch is not supported in TensorRT - PADDLE_ENFORCE_NE( - axis, 0, - platform::errors::InvalidArgument( - "Invalid split axis. 
Split on batch is not supported in TensorRT")); std::vector output_lengths = BOOST_GET_CONST(std::vector, op_desc.GetAttr("sections")); diff --git a/paddle/fluid/inference/tensorrt/convert/stack_op.cc b/paddle/fluid/inference/tensorrt/convert/stack_op.cc index d538c58879d78..6105e10799e55 100644 --- a/paddle/fluid/inference/tensorrt/convert/stack_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc @@ -45,6 +45,11 @@ class StackOpConverter : public OpConverter { for (int i = 0; i < input_num; ++i) { inputs[i] = engine_->GetITensor(input[i]); + if (op_desc.HasAttr("out_threshold")) { + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + engine_->SetTensorDynamicRange(inputs[i], out_scale); + } } int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); @@ -53,26 +58,19 @@ class StackOpConverter : public OpConverter { } nvinfer1::ILayer* layer = nullptr; - if (engine_->with_dynamic_shape()) { #if IS_TRT_VERSION_GE(6000) - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::StackPluginDynamic* plugin = - new plugin::StackPluginDynamic(axis, input_num, with_fp16); - layer = engine_->AddDynamicPlugin(inputs, input_num, plugin); - assert(layer != nullptr); + bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::StackPluginDynamic* plugin = + new plugin::StackPluginDynamic(axis, input_num, with_fp16); + layer = engine_->AddDynamicPlugin(inputs, input_num, plugin); + PADDLE_ENFORCE_NOT_NULL( + layer, platform::errors::InvalidArgument( + "trt stack layer in converter could not be created.")); #else - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that " - "your TRT version is no less than 6.0")); + PADDLE_THROW(platform::errors::Fatal( + "You are running the TRT Dynamic Shape mode, need to confirm that " + "your TRT version is no less than 6.0")); #endif - } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) interface" - " to set the shape information to run the dynamic shape mode.")); - } auto output_name = op_desc.Output("Y").front(); RreplenishLayerAndOutput(layer, "stack", {output_name}, test_mode); free(inputs); diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index 971f99e691972..6158fd130bad8 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -60,6 +60,9 @@ static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) { static nvinfer1::IPluginRegistry* GetPluginRegistry() { return static_cast(dy::getPluginRegistry()); } +static int GetInferLibVersion() { + return static_cast(dy::getInferLibVersion()); +} #endif // A logger for create TensorRT infer builder. 
@@ -67,9 +70,12 @@ class NaiveLogger : public nvinfer1::ILogger { public: void log(nvinfer1::ILogger::Severity severity, const char* msg) override { switch (severity) { - case Severity::kINFO: + case Severity::kVERBOSE: VLOG(3) << msg; break; + case Severity::kINFO: + VLOG(2) << msg; + break; case Severity::kWARNING: LOG(WARNING) << msg; break; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index b681b098c8c76..c8dfc169535da 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -42,6 +42,10 @@ struct SimpleOpTypeSetTeller : public Teller { teller_set.insert("multihead_matmul"); teller_set.insert("skip_layernorm"); teller_set.insert("slice"); + int8_teller_set.insert("fused_embedding_eltwise_layernorm"); + int8_teller_set.insert("multihead_matmul"); + int8_teller_set.insert("skip_layernorm"); + int8_teller_set.insert("slice"); #endif #if IS_TRT_VERSION_GE(7130) teller_set.insert("group_norm"); @@ -61,6 +65,8 @@ struct SimpleOpTypeSetTeller : public Teller { // use this set for no calib int8. std::unordered_set int8_teller_set{"mul", "conv2d", + "matmul", + "stack", "conv2d_fusion", "pool2d", "relu", @@ -114,7 +120,6 @@ struct SimpleOpTypeSetTeller : public Teller { "yolo_box", "roi_align", "affine_channel", - "multiclass_nms", "nearest_interp", "anchor_generator", }; @@ -132,13 +137,93 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; for (auto& teller : tellers_) { - if (op_type == "pool2d" || op_type == "conv2d" || - op_type == "depthwise_conv2d" || op_type == "conv2d_transpose") { + if (op_type == "depthwise_conv2d") { std::vector paddings = BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); if (paddings.size() > 2) return false; } + + if (op_type == "pool2d") { + std::vector paddings = + BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); + if (paddings.size() > 2) return false; + if (desc.Input("X").size() != 1) { + VLOG(3) << "TRT Pool2d expect 1 input, but got " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "TRT Pool2d has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + if (!desc.HasAttr("pooling_type")) { + return false; + } else { + std::string pool_type = + BOOST_GET_CONST(std::string, desc.GetAttr("pooling_type")); + if (pool_type != "max" && pool_type != "avg") { + VLOG(3) << "Wrong pool op type, the trt do not support the " + << pool_type << " pool type."; + return false; + } + } + } + + if (op_type == "conv2d" || op_type == "conv2d_transpose" || + op_type == "conv2d_fusion") { + std::vector paddings = + BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); + + // conv2d and conv2d_transpose need padding check + if (paddings.size() > 2 && op_type != "conv2d_fusion") return false; + + if (desc.Input("Input").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 input, but got " + << desc.Input("Input").size() << " input."; + return false; + } + + if (desc.Input("Filter").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 filter, but got " + << desc.Input("Filter").size() << " filter."; + return false; + } + + if (desc.HasAttr("enable_int8")) { + if (op_type == "conv2d" || op_type == "conv2d_fusion") { + if (!desc.HasAttr("Input_scale")) { + VLOG(3) << "Input scale not found. 
TRT int8" + " requires conv/deconv to have " + "input quantization scales."; + return false; + } + } + } + + if (op_type == "conv2d_transpose") { + if (!desc.HasAttr("dilations")) { + return false; + } else { + const std::vector dilations = + BOOST_GET_CONST(std::vector, desc.GetAttr("dilations")); + if (dilations[0] != 1 || dilations[1] != 1) { + VLOG(3) << "In conv2d_transpose, Dilations must be (1, 1) for " + "tensorRT, but given (" + << dilations[0] << ", " << dilations[1] << ")"; + return false; + } + } + } + + if (desc.Output("Output").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 output, but got " + << desc.Output("Output").size() << " output."; + return false; + } + } + if (op_type == "matmul") { auto* block = desc.Block(); for (auto& param_name : desc.Inputs()) { @@ -146,7 +231,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto* var_desc = block->FindVar(var_name); const auto shape = var_desc->GetShape(); if (shape.size() < 3) { - VLOG(1) + VLOG(3) << "matmul op dims < 3 not supported in tensorrt, but got dims " << shape.size() << ", so jump it."; return false; @@ -184,7 +269,18 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (axis.size() >= nvinfer1::Dims::MAX_DIMS) return false; } } - if (op_type == "flatten2" || op_type == "flatten") { + if (op_type == "flatten2") { + // flatten doesn't support dynamic shape currently + if (!desc.HasAttr("axis")) { + return false; + } else { + if (with_dynamic_shape) return false; + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (axis != 1) return false; + } + } + + if (op_type == "flatten") { // flatten doesn't support dynamic shape currently if (!desc.HasAttr("axis")) { return false; @@ -224,7 +320,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto* var_desc = block->FindVar(var_name); const auto shape = var_desc->GetShape(); if (shape.size() != 3) { - VLOG(1) << "multiclass_nms op dims != 3 not supported in tensorrt, " + VLOG(3) << "multiclass_nms op dims != 3 not supported in tensorrt, " "but got dims " << shape.size() << ", so jump it."; return false; @@ -247,18 +343,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (registry == nullptr) return false; } - if (op_type == "fc" || op_type == "mul") { - const int x_num_col_dims = - desc.HasAttr("x_num_col_dims") - ? BOOST_GET_CONST(int, desc.GetAttr("x_num_col_dims")) - : (desc.HasAttr("in_num_col_dims") - ? 
BOOST_GET_CONST(int, desc.GetAttr("in_num_col_dims")) - : 1); - if (x_num_col_dims != 1 && x_num_col_dims != 2) { - return false; - } - } - if (op_type == "nearest_interp") { std::vector attrs{"data_layout", "interp_method", "align_corners", "scale", @@ -274,6 +358,25 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto interp_method = BOOST_GET_CONST(std::string, desc.GetAttr("interp_method")); if (interp_method != "nearest") return false; + + if (!desc.HasAttr("scale") || !desc.HasAttr("out_h") || + !desc.HasAttr("out_w")) { + return false; + } else { + auto scale = BOOST_GET_CONST(float, desc.GetAttr("scale")); + auto out_h = BOOST_GET_CONST(int, desc.GetAttr("out_h")); + auto out_w = BOOST_GET_CONST(int, desc.GetAttr("out_w")); + if (!(scale > 0.f && (out_h <= 0 && out_w <= 0))) { + if (out_h <= 0) { + VLOG(3) << "out_h must be greater than 0 if scale is not set."; + return false; + } + if (out_w <= 0) { + VLOG(3) << "out_w must be greater than 0 if scale is not set."; + return false; + } + } + } } if (op_type == "roi_align") { @@ -298,6 +401,235 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (spatial_scale <= 0.f) return false; } + if (op_type == "hard_swish") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "HardSwish op has only 1 input, but got " + << desc.Input("X").size(); + return false; + } + + if (desc.Output("Out").size() != 1) { + VLOG(3) << "HardSwish op has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "batch_norm") { + const std::vector bn_inputs = {"X", "Bias", "Mean", "Scale", + "Variance"}; + for (unsigned int i = 0; i < bn_inputs.size(); i++) { + if (desc.Input(bn_inputs[i]).size() != 1) { + VLOG(3) << "Invalid " << bn_inputs[i] + << "'s size of batch_norm TRT " + "converter. Expected 1, received " + << desc.Input(bn_inputs[i]).size() << "."; + return false; + } + } + + if (desc.Output("Y").size() != 1) { + VLOG(3) << "Invalid output Y's size of batch_norm TRT " + "converter. Expected 1, received " + << desc.Output("Y").size() << "."; + return false; + } + } + + if (op_type == "split") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid input X's size of split TRT converter. " + "Expected 1, received " + << desc.Input("X").size() << "."; + return false; + } + if (!desc.HasAttr("axis")) { + return false; + } else { + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (axis == 0) { + VLOG(3) << "Invalid split axis. Split on batch is not supported in " + "TensorRT"; + return false; + } + } + } + + if (op_type == "slice") { + if (!desc.HasAttr("axes") || !desc.HasAttr("starts") || + !desc.HasAttr("ends")) { + return false; + } else { + std::vector axes = + BOOST_GET_CONST(std::vector, desc.GetAttr("axes")); + std::vector starts = + BOOST_GET_CONST(std::vector, desc.GetAttr("starts")); + std::vector ends = + BOOST_GET_CONST(std::vector, desc.GetAttr("ends")); + if (axes.size() != starts.size() || axes.size() != ends.size()) { + return false; + } + if (!with_dynamic_shape) { + for (size_t i = 0; i < axes.size(); i++) { + if (axes[i] == 0) { + VLOG(3) << "Invalid slice axis. 
Slice on batch axis is not " "supported in TensorRT"; return false; } } } } + + if (op_type == "elementwise_add" || op_type == "elementwise_mul") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "The input op's Input(\"X\").size() " + "should equal to 1, but received Input(\"X\").size() = " + << desc.Input("X").size() << "."; + return false; + } + if (desc.Input("Y").size() != 1) { + VLOG(3) << "The input op's Input(\"Y\").size() " + "should equal to 1, but received Input(\"Y\").size() = " + << desc.Input("Y").size() << "."; + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "The input op's Output(\"Out\").size() " + "should equal to 1, but received Output(\"Out\").size() = " + << desc.Output("Out").size() << "."; + return false; + } + } + + if (op_type == "stack") { + if (!with_dynamic_shape) { + VLOG(3) + << "static shape mode is not supported for TRT stack.\n" + "You can use the config.SetTRTDynamicShapeInfo(...) interface" + " to set the shape information to run the dynamic shape " + "mode."; + return false; + } + } + + if (op_type == "fused_embedding_eltwise_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) << "fused_embedding_eltwise_layernorm should run on dynamic " + "shape mode."; + return false; + } + if (desc.Input("Ids").size() != desc.Input("Embs").size()) { + VLOG(3) << "The id and emb size of fused EmbEltwiseLayerNormOp " + "should be same "; + return false; + } + } + + if (op_type == "gelu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "gelu op has only 1 input, but got " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "gelu op has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "layer_norm") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "input of layer_norm op converter should be 1, got " + << desc.Input("X").size(); + return false; + } + if (desc.Input("Bias").size() != 1) { + VLOG(3) << "Bias of layer_norm op converter should be 1, got " + << desc.Input("Bias").size(); + return false; + } + if (desc.Input("Scale").size() != 1) { + VLOG(3) << "Scale of layer_norm op converter should be 1, got " + << desc.Input("Scale").size(); + return false; + } + if (desc.Output("Y").size() != 1) { + VLOG(3) << "output of layer_norm op converter should be 1, got " + << desc.Output("Y").size(); + return false; + } + } + + if (op_type == "leaky_relu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid number of TRT leaky_relu op converter " + "inputs. Expected 1, but received " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "output of leaky_relu op converter should be 1, got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "pad") { + const float pad_value = BOOST_GET_CONST(float, desc.GetAttr("pad_value")); + if (pad_value != 0.0f) { + VLOG(3) << "The pad layer of TRT only supports zero."; + return false; + } + } + + if (op_type == "prelu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid input X's size of prelu TRT converter. " + "Expected 1, received " + << desc.Input("X").size() << "."; + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "Invalid output Out's size of prelu TRT converter. 
" + "Expected 1, received " + << desc.Output("Out").size() << "."; + return false; + } + } + + if (op_type == "roi_align") { + if (!with_dynamic_shape) { + VLOG(3) << "TRT roi align plugin only accept the dynamic shape, " + "because that " + "the roi_align will change the batch size."; + return false; + } + } + + if (op_type == "shuffle_channel") { + if (with_dynamic_shape) { + VLOG(3) << "You are running the TRT Dynamic Shape mode, " + "the shuffle_channel op does not support dynamic shape yet"; + return false; + } + } + + if (op_type == "skip_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) << "the skip_layernorm does not support static shape yet"; + return false; + } + } + + if (op_type == "multihead_matmul") { + if (!with_dynamic_shape) { + VLOG(3) << "the multihead_matmul does not support static shape yet"; + return false; + } + } + if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } return false; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 75628adbe8a85..f74cd671d6dca 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -522,10 +522,10 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_analysis_test(trt_instance_norm_test SRCS trt_instance_norm_converter_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_INSTANCE_NORM_MODEL}/) - inference_analysis_test(test_analyzer_capi_gpu SRCS analyzer_capi_gpu_tester.cc + inference_analysis_test(test_analyzer_capi_exp_gpu SRCS analyzer_capi_exp_gpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) - inference_analysis_test(test_analyzer_capi_xpu SRCS analyzer_capi_xpu_tester.cc + inference_analysis_test(test_analyzer_capi_exp_xpu SRCS analyzer_capi_exp_xpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) @@ -604,14 +604,23 @@ inference_analysis_test(lite_resnet50_test SRCS lite_resnet50_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_MODEL_DIR}) -inference_analysis_test(test_analyzer_capi SRCS analyzer_capi_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c - ARGS --infer_model=${RESNET50_MODEL_DIR}/model) +inference_analysis_test(test_analyzer_capi_exp SRCS analyzer_capi_exp_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${RESNET50_MODEL_DIR}/model) + +inference_analysis_test(test_analyzer_capi_exp_pd_config SRCS analyzer_capi_exp_pd_config_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) + +inference_analysis_test(test_analyzer_capi_exp_pd_tensor SRCS analyzer_capi_exp_pd_tensor_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) -inference_analysis_test(test_analyzer_capi_pd_tensor SRCS analyzer_capi_pd_tensor_tester.cc +if (NOT APPLE AND NOT WIN32) + inference_analysis_test(test_analyzer_capi_exp_pd_threads SRCS analyzer_capi_exp_pd_threads_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) - +endif() inference_analysis_test(test_analyzer_zerocopytensor_tensor SRCS analyzer_zerocopy_tensor_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model) @@ -621,17 +630,17 @@ 
inference_analysis_test(test_analyzer_paddletensor_tensor SRCS analyzer_paddle_t ARGS --infer_model=${OCR_INSTALL_DIR}/model --infer_data=${OCR_INSTALL_DIR}/data.txt --refer_result=${OCR_INSTALL_DIR}/result.txt) if(WITH_MKLDNN) - inference_analysis_test(test_analyzer_capi_int SRCS analyzer_capi_int_tester.cc + inference_analysis_test(test_analyzer_capi_exp_int SRCS analyzer_capi_exp_int_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${INT8_DATA_DIR}/resnet50/model) - endif() +endif() -inference_analysis_test(test_analyzer_capi_ner SRCS analyzer_capi_ner_tester.cc +inference_analysis_test(test_analyzer_capi_exp_ner SRCS analyzer_capi_exp_ner_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model) if(WITH_GPU) - inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc + inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_MODEL_DIR}) endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc new file mode 100644 index 0000000000000..de9e2afd705f9 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc @@ -0,0 +1,160 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_Config, gpu_interface) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + std::string prog_file = model_dir + "/__model__"; + std::string param_file = model_dir + "/__params__"; + std::string opt_cache_dir = FLAGS_infer_model + "/OptimCacheDir"; + const char* ops_name = "conv_2d"; + + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, prog_file.c_str(), param_file.c_str()); + PD_ConfigSetOptimCacheDir(config, opt_cache_dir.c_str()); + + PD_ConfigEnableUseGpu(config, 100, 0); + bool use_gpu = PD_ConfigUseGpu(config); + EXPECT_TRUE(use_gpu); + int init_size = PD_ConfigMemoryPoolInitSizeMb(config); + EXPECT_EQ(init_size, 100); + int gpu_device_id = PD_ConfigGpuDeviceId(config); + EXPECT_EQ(gpu_device_id, 0); + float frac = PD_ConfigFractionOfGpuMemoryForPool(config); + LOG(INFO) << frac; + PD_ConfigEnableCudnn(config); + bool cudnn = PD_ConfigCudnnEnabled(config); + EXPECT_TRUE(cudnn); + + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_INT8, FALSE, + TRUE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + + const char* tensor_name = "image"; + size_t shapes_num[1] = {4}; + int32_t min_shape[4] = {1, 3, 36, 36}; + int32_t max_shape[4] = {1, 3, 224, 224}; + int32_t opt_shape[4] = {1, 3, 224, 224}; + int32_t* min_shape_ptr = min_shape; + int32_t* max_shape_ptr = max_shape; + int32_t* opt_shape_ptr = opt_shape; + PD_ConfigSetTrtDynamicShapeInfo(config, 1, &tensor_name, shapes_num, + &min_shape_ptr, &max_shape_ptr, + &opt_shape_ptr, FALSE); + PD_ConfigDisableTensorRtOPs(config, 1, &ops_name); + PD_ConfigEnableTensorRtOSS(config); + bool oss_enabled = PD_ConfigTensorRtOssEnabled(config); + EXPECT_TRUE(oss_enabled); + + PD_ConfigEnableTensorRtDla(config, 4); + bool dla_enabled = PD_ConfigTensorRtDlaEnabled(config); + EXPECT_TRUE(dla_enabled); + + PD_ConfigEnableGpuMultiStream(config); + bool thread_local_thread = PD_ConfigThreadLocalStreamEnabled(config); + EXPECT_TRUE(thread_local_thread); + + PD_ConfigDisableGpu(config); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, use_gpu) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + int num_thread = PD_ConfigGetCpuMathLibraryNumThreads(config); + EXPECT_EQ(num_thread, 10); + + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_ConfigSetOptimCacheDir(config, + (FLAGS_infer_model + "/OptimCacheDir").c_str()); + const char* model_dir_ = PD_ConfigGetModelDir(config); + LOG(INFO) << model_dir_; + + PD_ConfigEnableUseGpu(config, 100, 0); + bool use_gpu = PD_ConfigUseGpu(config); + EXPECT_TRUE(use_gpu); + int device_id = PD_ConfigGpuDeviceId(config); + EXPECT_EQ(device_id, 0); + int init_size = PD_ConfigMemoryPoolInitSizeMb(config); + EXPECT_EQ(init_size, 100); + + float frac = PD_ConfigFractionOfGpuMemoryForPool(config); + LOG(INFO) << frac; + + PD_ConfigEnableCudnn(config); + bool cudnn = PD_ConfigCudnnEnabled(config); + EXPECT_TRUE(cudnn); + + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_FLOAT32, + FALSE, 
FALSE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_ConfigEnableMemoryOptim(config); + bool memory_optim_enable = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_optim_enable); + PD_ConfigEnableProfile(config); + bool profiler_enable = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profiler_enable); + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, trt_int8) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigEnableUseGpu(config, 100, 0); + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_INT8, FALSE, + TRUE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, trt_fp16) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigEnableUseGpu(config, 100, 0); + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_HALF, FALSE, + FALSE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_PredictorDestroy(predictor); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc new file mode 100644 index 0000000000000..d3a15cb285772 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
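The GPU tester above exercises nearly every TensorRT-related config call at once; as a compact reference, a minimal sketch of setting up an FP16 TensorRT engine with one dynamic-shape input follows. The model paths, the tensor name "image", and the shape values are placeholders, not taken from this patch.

#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

// Illustrative only; paths and tensor name are assumptions.
void BuildTrtFp16Predictor() {
  PD_Config* config = PD_ConfigCreate();
  PD_ConfigSetModel(config, "./mobilenet/__model__", "./mobilenet/__params__");
  PD_ConfigEnableUseGpu(config, /*memory_pool_init_size_mb=*/100, /*gpu_id=*/0);
  // Arguments mirror the trt_fp16 test above: workspace size, max batch,
  // min subgraph size, precision, use_static, use_calib_mode (names assumed).
  PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_HALF, FALSE,
                                FALSE);

  const char* tensor_name = "image";
  size_t shapes_num[1] = {4};
  int32_t min_shape[4] = {1, 3, 112, 112};
  int32_t max_shape[4] = {1, 3, 224, 224};
  int32_t opt_shape[4] = {1, 3, 224, 224};
  int32_t* min_ptr = min_shape;
  int32_t* max_ptr = max_shape;
  int32_t* opt_ptr = opt_shape;
  PD_ConfigSetTrtDynamicShapeInfo(config, 1, &tensor_name, shapes_num, &min_ptr,
                                  &max_ptr, &opt_ptr, FALSE);

  // As in the testers above, the config is not destroyed once a predictor has
  // been created from it.
  PD_Predictor* predictor = PD_PredictorCreate(config);
  PD_PredictorDestroy(predictor);
}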
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void predictor_run() { + std::string model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + LOG(INFO) << "The inputs' size is: " << input_names->size; + EXPECT_EQ(input_names->size, 2u); + + int32_t shape_0[4] = {1, 3, 224, 224}; + float data_0[1 * 3 * 224 * 224] = {0}; + PD_Tensor* input_0 = PD_PredictorGetInputHandle(predictor, "image"); + PD_TensorReshape(input_0, 4, shape_0); + PD_TensorCopyFromCpuFloat(input_0, data_0); + int32_t shape_1[2] = {1, 1}; + int64_t data_1[1] = {0}; + PD_Tensor* input_1 = PD_PredictorGetInputHandle(predictor, "label"); + PD_TensorReshape(input_1, 2, shape_1); + PD_TensorCopyFromCpuInt64(input_1, data_1); + + LOG(INFO) << "Run Inference in CAPI encapsulation. "; + EXPECT_TRUE(PD_PredictorRun(predictor)); + + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + LOG(INFO) << "output size is: " << output_names->size; + for (size_t index = 0; index < output_names->size; ++index) { + LOG(INFO) << "output[" << index + << "]'s name is: " << output_names->data[index]; + PD_Tensor* output = + PD_PredictorGetOutputHandle(predictor, output_names->data[index]); + PD_OneDimArrayInt32* shape = PD_TensorGetShape(output); + LOG(INFO) << "output[" << index << "]'s shape_size is: " << shape->size; + int32_t out_size = 1; + for (size_t i = 0; i < shape->size; ++i) { + LOG(INFO) << "output[" << index << "]'s shape is: " << shape->data[i]; + out_size = out_size * shape->data[i]; + } + float* out_data = new float[out_size]; + PD_TensorCopyToCpuFloat(output, out_data); + LOG(INFO) << "output[" << index << "]'s DATA is: " << out_data[0]; + delete[] out_data; + PD_OneDimArrayInt32Destroy(shape); + PD_TensorDestroy(output); + } + PD_PredictorClearIntermediateTensor(predictor); + PD_PredictorTryShrinkMemory(predictor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(input_1); + PD_TensorDestroy(input_0); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +#ifdef PADDLE_WITH_MKLDNN +TEST(PD_PredictorRun, predictor_run) { predictor_run(); } +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc new file mode 100644 index 0000000000000..4369cd78dfa37 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
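Every tester above pairs each PD_*Create/Get call with an explicit PD_*Destroy. In C++ callers this bookkeeping can be delegated to std::unique_ptr with custom deleters; the sketch below is illustrative only (the alias names, the model path, and the input name "image" are assumptions), using the same destroy functions the testers invoke by hand.

#include <memory>
#include <string>
#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

using ConfigPtr = std::unique_ptr<PD_Config, decltype(&PD_ConfigDestroy)>;
using PredictorPtr =
    std::unique_ptr<PD_Predictor, decltype(&PD_PredictorDestroy)>;
using TensorPtr = std::unique_ptr<PD_Tensor, decltype(&PD_TensorDestroy)>;

void RunOnce(const std::string& model_dir) {
  ConfigPtr config(PD_ConfigCreate(), PD_ConfigDestroy);
  PD_ConfigDisableGpu(config.get());
  PD_ConfigSetModelDir(config.get(), model_dir.c_str());
  // The config appears to be consumed by PD_PredictorCreate (the testers never
  // destroy it afterwards), hence the release().
  PredictorPtr predictor(PD_PredictorCreate(config.release()),
                         PD_PredictorDestroy);
  TensorPtr input(PD_PredictorGetInputHandle(predictor.get(), "image"),
                  PD_TensorDestroy);
  // ... reshape, copy data, PD_PredictorRun(predictor.get()), read outputs ...
}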
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_PredictorRun, predictor_run) { + auto model_dir = FLAGS_infer_model; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/param").c_str()); + PD_ConfigDisableGpu(config); + + PD_Predictor *predictor = PD_PredictorCreate(config); + size_t input_num = PD_PredictorGetInputNum(predictor); + LOG(INFO) << "Input num: " << input_num; + size_t output_num = PD_PredictorGetOutputNum(predictor); + LOG(INFO) << "Output num: " << output_num; + + PD_OneDimArrayCstr *input_names = PD_PredictorGetInputNames(predictor); + EXPECT_EQ(input_names->size, 2u); + LOG(INFO) << "Predictor start run!"; + PD_Tensor *inputs[2]; + inputs[0] = PD_PredictorGetInputHandle(predictor, input_names->data[0]); + inputs[1] = PD_PredictorGetInputHandle(predictor, input_names->data[1]); + LOG(INFO) << "Predictor start run!"; + // inputs[0]: word, use lod memory in stack + int32_t shape_0[2] = {11, 1}; + int64_t data_0[11 * 1] = {12673, 9763, 905, 284, 45, 7474, 20, 17, 1, 4, 9}; + size_t lod_layer_0[2] = {0, 11}; + PD_OneDimArraySize layer_0; + layer_0.size = 2; + layer_0.data = lod_layer_0; + PD_OneDimArraySize *layer_0_ptr = &layer_0; + PD_TwoDimArraySize lod_0; + lod_0.size = 1; + lod_0.data = &layer_0_ptr; + PD_TensorReshape(inputs[0], 2, shape_0); + PD_TensorCopyFromCpuInt64(inputs[0], data_0); + PD_TensorSetLod(inputs[0], &lod_0); + + // inputs[1]: mention, use lod memory in heap + int32_t shape_1[2] = {11, 1}; + int64_t data_1[11 * 1] = {27, 0, 0, 33, 34, 33, 0, 0, 0, 1, 2}; + PD_TwoDimArraySize *lod_1_ptr = new PD_TwoDimArraySize(); + lod_1_ptr->size = 1; + lod_1_ptr->data = new PD_OneDimArraySize *[1]; + lod_1_ptr->data[0] = new PD_OneDimArraySize(); + lod_1_ptr->data[0]->size = 2; + lod_1_ptr->data[0]->data = new size_t[2]; + lod_1_ptr->data[0]->data[0] = 0; + lod_1_ptr->data[0]->data[1] = 11; + + PD_TensorReshape(inputs[1], 2, shape_1); + PD_TensorCopyFromCpuInt64(inputs[1], data_1); + PD_TensorSetLod(inputs[1], lod_1_ptr); + // retrieve the lod memory + delete[] lod_1_ptr->data[0]->data; + delete lod_1_ptr->data[0]; + delete[] lod_1_ptr->data; + delete lod_1_ptr; + lod_1_ptr = nullptr; + + LOG(INFO) << "Predictor start run!"; + bool success = PD_PredictorRun(predictor); + EXPECT_TRUE(success); + LOG(INFO) << "Predictor run success!"; + PD_OneDimArrayCstr *output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor *output = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_TwoDimArraySize *output_lod = PD_TensorGetLod(output); + + PD_TwoDimArraySizeDestroy(output_lod); + PD_TensorDestroy(output); + PD_OneDimArrayCstrDestroy(output_names); + + PD_TensorDestroy(inputs[0]); + PD_TensorDestroy(inputs[1]); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc new file mode 100644 index 0000000000000..18107704ae420 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc @@ -0,0 +1,108 @@ 
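The NER tester above sets a single-sequence LoD of {0, 11} on each input, so the nesting of PD_TwoDimArraySize is easy to miss. A sketch with assumed values, not part of this patch: a batch holding two sequences of lengths 5 and 6 (11 rows total).

#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

// Level-0 offsets {0, 5, 11}: rows [0, 5) form the first sequence and rows
// [5, 11) the second; the outer PD_TwoDimArraySize holds one entry per level.
void SetTwoSequenceLod(PD_Tensor* word_tensor) {
  size_t offsets[3] = {0, 5, 11};
  PD_OneDimArraySize level;
  level.size = 3;
  level.data = offsets;
  PD_OneDimArraySize* level_ptr = &level;

  PD_TwoDimArraySize lod;
  lod.size = 1;           // a single LoD level
  lod.data = &level_ptr;  // array of per-level offset arrays
  PD_TensorSetLod(word_tensor, &lod);
}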
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_Config, interface) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + std::string prog_file = model_dir + "/__model__"; + std::string param_file = model_dir + "/__params__"; + std::string opt_cache_dir = FLAGS_infer_model + "/OptimCacheDir"; + + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModelDir(config, model_dir.c_str()); + std::string model_dir_ = PD_ConfigGetModelDir(config); + EXPECT_EQ(model_dir, model_dir_); + + PD_ConfigSetModel(config, prog_file.c_str(), param_file.c_str()); + PD_ConfigSetProgFile(config, prog_file.c_str()); + PD_ConfigSetParamsFile(config, param_file.c_str()); + PD_ConfigSetOptimCacheDir(config, opt_cache_dir.c_str()); + std::string prog_file_ = PD_ConfigGetProgFile(config); + std::string param_file_ = PD_ConfigGetParamsFile(config); + EXPECT_EQ(prog_file, prog_file_); + EXPECT_EQ(param_file, param_file_); + + PD_ConfigDisableFCPadding(config); + bool fc_padding = PD_ConfigUseFcPadding(config); + EXPECT_FALSE(fc_padding); + + PD_ConfigDisableGpu(config); + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + +#ifndef PADDLE_WITH_LITE + PD_ConfigEnableLiteEngine(config, PD_PRECISION_FLOAT32, TRUE, 0, nullptr, 0, + nullptr); + bool lite_enabled = PD_ConfigLiteEngineEnabled(config); + EXPECT_TRUE(lite_enabled); +#endif + + PD_ConfigSwitchIrDebug(config, TRUE); +#ifdef PADDLE_WITH_MKLDNN + const char* ops_name = "conv_2d"; + PD_ConfigEnableMKLDNN(config); + PD_ConfigSetMkldnnOp(config, 1, &ops_name); + PD_ConfigSetMkldnnCacheCapacity(config, 100); + bool mkldnn_enabled = PD_ConfigMkldnnEnabled(config); + EXPECT_TRUE(mkldnn_enabled); + + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + int32_t cpu_threads = PD_ConfigGetCpuMathLibraryNumThreads(config); + EXPECT_EQ(cpu_threads, 10); + + PD_ConfigEnableMkldnnQuantizer(config); + bool mkldnn_qt_enabled = PD_ConfigMkldnnQuantizerEnabled(config); + EXPECT_TRUE(mkldnn_qt_enabled); + + PD_ConfigEnableMkldnnBfloat16(config); + PD_ConfigSetBfloat16Op(config, 1, &ops_name); + bool mkldnn_bf16_enabled = PD_ConfigMkldnnBfloat16Enabled(config); + EXPECT_TRUE(mkldnn_bf16_enabled); +#endif + + PD_ConfigEnableMemoryOptim(config); + bool memory_enabled = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_enabled); + + PD_ConfigEnableProfile(config); + bool profile_enabled = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profile_enabled); + + PD_ConfigDisableGlogInfo(config); + bool glog_diabled = PD_ConfigGlogInfoDisabled(config); + EXPECT_TRUE(glog_diabled); + + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + + PD_ConfigPartiallyRelease(config); + 
PD_ConfigDestroy(config); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc new file mode 100644 index 0000000000000..f4017fc5a7f34 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void PD_run() { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuFloat(tensor, input.data()); + PD_TensorDataFloat(tensor, &place, &size); + PD_TensorMutableDataFloat(tensor, place); + + PD_TwoDimArraySize lod; + lod.size = 0; + lod.data = NULL; + PD_TensorSetLod(tensor, &lod); + + PD_PredictorRun(predictor); + + std::vector out_data; + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_OneDimArrayInt32* output_shape = PD_TensorGetShape(output_tensor); + int32_t out_num = std::accumulate(output_shape->data, + output_shape->data + output_shape->size, 1, + std::multiplies()); + out_data.resize(out_num); + PD_TensorCopyToCpuFloat(output_tensor, out_data.data()); + LOG(INFO) << "Output tensor name is: " << PD_TensorGetName(output_tensor); + PD_DataType data_type = PD_TensorGetDataType(output_tensor); + EXPECT_EQ(data_type, PD_DATA_FLOAT32); + + PD_TwoDimArraySize* out_lod = PD_TensorGetLod(output_tensor); + + PD_TwoDimArraySizeDestroy(out_lod); + PD_OneDimArrayInt32Destroy(output_shape); + PD_TensorDestroy(output_tensor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} +TEST(PD_Tensor, PD_run) { PD_run(); } + +TEST(PD_Tensor, int32) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + 
PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuInt32(tensor, input.data()); + int32_t* data_ptr = PD_TensorDataInt32(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + int32_t* mutable_data_ptr = PD_TensorMutableDataInt32(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_INT32); + PD_TensorCopyToCpuInt32(tensor, input.data()); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Tensor, int64) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuInt64(tensor, input.data()); + int64_t* data_ptr = PD_TensorDataInt64(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + int64_t* mutable_data_ptr = PD_TensorMutableDataInt64(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_INT64); + PD_TensorCopyToCpuInt64(tensor, input.data()); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Tensor, uint8) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + uint8_t input[1 * 3 * 300 * 300] = {0}; + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuUint8(tensor, input); + uint8_t* data_ptr = PD_TensorDataUint8(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + uint8_t* mutable_data_ptr = PD_TensorMutableDataUint8(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_UINT8); + PD_TensorCopyToCpuUint8(tensor, input); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +std::string read_file(std::string filename) { + std::ifstream file(filename); + return std::string((std::istreambuf_iterator(file)), + std::istreambuf_iterator()); +} + +TEST(PD_Tensor, from_buffer) { + PD_Config* config = PD_ConfigCreate(); + std::string prog_file = FLAGS_infer_model + "/__model__"; + std::string params_file = FLAGS_infer_model + "/__params__"; + + std::string prog_str = read_file(prog_file); + std::string params_str = read_file(params_file); + + PD_ConfigSetModelBuffer(config, 
prog_str.c_str(), prog_str.size(), + params_str.c_str(), params_str.size()); + + bool model_from_memory = PD_ConfigModelFromMemory(config); + EXPECT_TRUE(model_from_memory); + PD_ConfigDestroy(config); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc new file mode 100644 index 0000000000000..8951c446b1f83 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +typedef struct RunParameter { + PD_Predictor* predictor; + int32_t* shapes; + size_t shape_size; + float* input_data; + int32_t out_size; + float* out_data; + int32_t thread_index; +} RunParameter; + +void* run(void* thread_param) { + struct RunParameter* param = (struct RunParameter*)thread_param; + LOG(INFO) << "Thread " << param->thread_index << " start run!"; + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(param->predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(param->predictor, input_names->data[0]); + PD_TensorReshape(tensor, param->shape_size, param->shapes); + PD_TensorCopyFromCpuFloat(tensor, param->input_data); + PD_PredictorRun(param->predictor); + PD_OneDimArrayCstr* output_names = + PD_PredictorGetOutputNames(param->predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(param->predictor, output_names->data[0]); + PD_OneDimArrayInt32* output_shape = PD_TensorGetShape(output_tensor); + param->out_size = 1; + for (size_t index = 0; index < output_shape->size; ++index) { + param->out_size = param->out_size * output_shape->data[index]; + } + PD_OneDimArrayInt32Destroy(output_shape); + param->out_data = + reinterpret_cast(malloc(param->out_size * sizeof(float))); + PD_TensorCopyToCpuFloat(output_tensor, param->out_data); + PD_TensorDestroy(output_tensor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + LOG(INFO) << "Thread " << param->thread_index << " end run!"; + return NULL; +} +void threads_run(int thread_num) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + + pthread_t* threads = + reinterpret_cast(malloc(thread_num * sizeof(pthread_t))); + RunParameter* params = reinterpret_cast( + malloc(thread_num * sizeof(RunParameter))); + int32_t shapes[4] = {1, 3, 300, 300}; + float* input = + reinterpret_cast(malloc(1 * 3 * 300 * 
300 * sizeof(float))); + memset(input, 0, 1 * 3 * 300 * 300 * sizeof(float)); + for (int i = 0; i < thread_num; ++i) { + params[i].predictor = PD_PredictorClone(predictor); + params[i].shapes = shapes; + params[i].shape_size = 4; + params[i].input_data = input; + params[i].out_size = 0; + params[i].out_data = NULL; + params[i].thread_index = i; + pthread_create(&(threads[i]), NULL, run, (params + i)); + } + for (int i = 0; i < thread_num; ++i) { + pthread_join(threads[i], NULL); + } + ASSERT_GT(params[0].out_size, 0); + + for (int i = 1; i < thread_num; ++i) { + ASSERT_EQ(params[i].out_size, params[0].out_size); + for (int j = 0; j < params[i].out_size; ++j) { + ASSERT_EQ(params[i].out_data[j], params[0].out_data[j]); + } + } + for (int i = 0; i < thread_num; ++i) { + PD_PredictorDestroy(params[i].predictor); + free(params[i].out_data); + } + free(input); + free(params); + free(threads); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Predictor, PD_multi_threads_run) { threads_run(10); } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc new file mode 100644 index 0000000000000..11de1a5a6fab4 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
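The threads tester above drives PD_PredictorClone through raw pthreads and malloc. The same clone-per-thread pattern is sketched below with std::thread for C++ callers; this is illustrative only, with the output bookkeeping of run() omitted and the 1x3x300x300 shape kept as an assumption.

#include <thread>
#include <vector>
#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

// Per-thread body: the same steps as run() above, minus output collection.
void RunClone(PD_Predictor* predictor, float* input) {
  PD_OneDimArrayCstr* names = PD_PredictorGetInputNames(predictor);
  PD_Tensor* tensor = PD_PredictorGetInputHandle(predictor, names->data[0]);
  int32_t shape[4] = {1, 3, 300, 300};
  PD_TensorReshape(tensor, 4, shape);
  PD_TensorCopyFromCpuFloat(tensor, input);
  PD_PredictorRun(predictor);
  PD_TensorDestroy(tensor);
  PD_OneDimArrayCstrDestroy(names);
}

void ThreadedRun(PD_Predictor* main_predictor, float* input, int thread_num) {
  std::vector<PD_Predictor*> clones(thread_num);
  std::vector<std::thread> workers;
  for (int i = 0; i < thread_num; ++i) {
    clones[i] = PD_PredictorClone(main_predictor);  // one clone per thread
    workers.emplace_back(RunClone, clones[i], input);
  }
  for (auto& t : workers) t.join();
  for (auto* clone : clones) PD_PredictorDestroy(clone);
}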
*/ + +#include +#include +#include +#include +#include + +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void predictor_run() { + std::string model_dir = FLAGS_infer_model; + std::string prog_file = model_dir + "/model"; + std::string params_file = model_dir + "/params"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModel(config, prog_file.c_str(), params_file.c_str()); + + PD_Predictor *predictor = PD_PredictorCreate(config); + PD_Tensor *tensor = PD_PredictorGetInputHandle(predictor, "data"); + + const int batch_size = 1; + const int channels = 3; + const int height = 318; + const int width = 318; + float *input = new float[batch_size * channels * height * width](); + + int32_t shape[4] = {batch_size, channels, height, width}; + PD_TensorReshape(tensor, 4, shape); + PD_TensorCopyFromCpuFloat(tensor, input); + EXPECT_TRUE(PD_PredictorRun(predictor)); + + delete[] input; + PD_TensorDestroy(tensor); + PD_PredictorDestroy(predictor); +} + +TEST(PD_PredictorRun, predictor_run) { predictor_run(); } + +#ifdef PADDLE_WITH_MKLDNN +TEST(PD_Config, profile_mkldnn) { + std::string model_dir = FLAGS_infer_model; + std::string prog_file = model_dir + "/model"; + std::string params_file = model_dir + "/params"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigEnableMKLDNN(config); + bool mkldnn_enable = PD_ConfigMkldnnEnabled(config); + EXPECT_TRUE(mkldnn_enable); + PD_ConfigEnableMkldnnQuantizer(config); + bool quantizer_enable = PD_ConfigMkldnnQuantizerEnabled(config); + EXPECT_TRUE(quantizer_enable); + PD_ConfigEnableMkldnnBfloat16(config); + PD_ConfigSetMkldnnCacheCapacity(config, 0); + PD_ConfigSetModel(config, prog_file.c_str(), params_file.c_str()); + PD_ConfigDestroy(config); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc new file mode 100644 index 0000000000000..f4fd04e85840d --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
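The basic tester above stops after PD_PredictorRun; fetching a result uses the same handle-based calls seen in the other testers. A compact sketch, assuming the predictor has already run and taking the first output (output names and sizes depend on the model):

#include <vector>
#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

std::vector<float> FetchFirstOutput(PD_Predictor* predictor) {
  PD_OneDimArrayCstr* out_names = PD_PredictorGetOutputNames(predictor);
  PD_Tensor* out = PD_PredictorGetOutputHandle(predictor, out_names->data[0]);
  PD_OneDimArrayInt32* shape = PD_TensorGetShape(out);
  int64_t numel = 1;
  for (size_t i = 0; i < shape->size; ++i) numel *= shape->data[i];
  std::vector<float> result(numel);
  PD_TensorCopyToCpuFloat(out, result.data());
  PD_OneDimArrayInt32Destroy(shape);
  PD_TensorDestroy(out);
  PD_OneDimArrayCstrDestroy(out_names);
  return result;
}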
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +#ifdef PADDLE_WITH_XPU +TEST(PD_Config, use_xpu) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_ConfigSetOptimCacheDir(config, + (FLAGS_infer_model + "/OptimCacheDir").c_str()); + const char *model_dir_ = PD_ConfigGetModelDir(config); + LOG(INFO) << model_dir_; + PD_ConfigEnableXpu(config, 0xfffc00); + bool use_xpu = PD_ConfigUseXpu(config); + EXPECT_TRUE(use_xpu); + int32_t device_id = PD_ConfigXpuDeviceId(config); + EXPECT_EQ(device_id, 0); + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + PD_ConfigEnableMemoryOptim(config); + bool memory_optim_enable = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_optim_enable); + PD_ConfigEnableProfile(config); + bool profiler_enable = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profiler_enable); + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + PD_ConfigDestroy(config); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index d9a4503cc1e5f..730d49e8acd93 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -206,12 +206,20 @@ void Copy(platform::NPUPlace dst_place, if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(dst_place.device); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by stream(" << stream << ")"; + if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU"); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); } else { + // On NPU, async operation after sync operation is ok, while sync operation + // after async is not ok, since the async operation may not be done. + // So, it is necessary to wait before the sync operation.
+ platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(dst_place))->Wait(); + platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU"); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); } @@ -226,12 +234,17 @@ void Copy(platform::CPUPlace dst_place, if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(src_place.device); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by thream(" << stream << ")"; + if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); } else { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(src_place))->Wait(); + platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU"); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); } @@ -254,6 +267,10 @@ void Copy(platform::NPUPlace dst_place, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, stream); } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + static_cast(pool.Get(dst_place))->Wait(); + platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU"); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); } @@ -268,6 +285,10 @@ void Copy(platform::NPUPlace dst_place, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, stream); } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + static_cast(pool.Get(dst_place))->Wait(); + platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU"); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index dac8c7b03e517..6e11c64afc4bd 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -42,6 +42,10 @@ if (WITH_GPU AND TENSORRT_FOUND) add_subdirectory(tensorrt) endif() +if (WITH_DLNNE) + add_subdirectory(dlnne) +endif() + if (WITH_LITE) add_subdirectory(lite) endif() @@ -69,7 +73,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() -register_operators(EXCLUDES py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op +register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS}) @@ -124,6 +128,7 @@ if (WITH_ASCEND) endif() if (WITH_ASCEND_CL) + cc_test(assign_op_npu_test SRCS assign_op_npu_test.cc DEPS assign_op) cc_library(npu_op_runner SRCS npu_op_runner.cc DEPS operator npu_info) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} npu_op_runner) endif() @@ -141,8 +146,8 @@ set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS}) set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") cc_test(test_common_infer_shape_functions SRCS test_common_infer_shape_functions.cc DEPS common_infer_shape_functions ${COMMON_OP_DEPS} activation_op elementwise_add_op softmax_op softmax) -cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op) cc_test(gather_test SRCS gather_test.cc DEPS tensor) +cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function) cc_test(beam_search_decode_op_test SRCS 
beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) @@ -161,12 +166,22 @@ endif() cc_library(tensor_formatter SRCS tensor_formatter.cc DEPS ${OP_HEADER_DEPS}) if (WITH_PYTHON) cc_library(py_func_op SRCS py_func_op.cc DEPS op_registry python pybind) + cc_library(py_layer_op SRCS py_layer_op.cc DEPS op_registry python pybind) +endif() + +if (WITH_ASCEND_CL) + cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor) + cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op eigen_cc_function scope device_context enforce executor compare_op) endif() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") add_subdirectory(benchmark) cc_test(op_debug_string_test SRCS op_debug_string_test.cc DEPS elementwise_add_op) +if (WITH_ASCEND_CL) + cc_test(transpose_op_npu_test SRCS transpose_op_npu_test.cc DEPS op_registry transpose_op scope device_context enforce executor) +endif() + if(WITH_MKLDNN) include(mkldnn/inplace_op_tests.cmake) @@ -180,3 +195,11 @@ if(WITH_UNITY_BUILD) # The specified link dependency needs to be displayed here. target_link_libraries(paddle_operators_unity ${OP_HEADER_DEPS} ${COMMON_OP_DEPS}) endif() + +if(WITH_ASCEND_CL) +cc_test(gelu_op_npu_test SRCS gelu_op_npu_test.cc DEPS op_registry gelu_op scope device_context enforce executor) +endif() + +if (WITH_GPU OR WITH_ASCEND_CL) +cc_test(copy_cross_scope_test SRCS copy_cross_scope_test.cc DEPS op_registry copy_cross_scope_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 94f2eb3672bd5..055909ba6f486 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -162,6 +162,12 @@ Sigmoid Activation Operator )DOC"; +UNUSED constexpr char SiluDoc[] = R"DOC( +Silu Activation Operator + +$$out = x * \\frac{1}{1 + e^{-x}}$$ +)DOC"; + UNUSED constexpr char LogSigmoidDoc[] = R"DOC( Logsigmoid Activation Operator @@ -697,6 +703,7 @@ It is recommended to use the defaults for this activation. 
}; REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc); +REGISTER_ACTIVATION_OP_MAKER(Silu, SiluDoc); REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc); REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc); REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc); @@ -782,6 +789,26 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { } }; +template +class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { + public: + using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("tanh_grad_grad"); + // input1: Out + op->SetInput("Out", this->Input("Out")); + // input2: ddx + op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); + op->SetInput("DOut", this->Input(framework::GradVarName("Out"))); + op->SetAttrMap(this->Attrs()); + // output: ddy + op->SetOutput("DOutNew", this->InputGrad("Out")); + op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); + } +}; + // ReluGrad: dx = dy if y >= 0 else 0 // ReluGradGrad: ddy = ddx if y >= 0 else 0 template @@ -1041,6 +1068,34 @@ namespace plat = paddle::platform; FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); +/* ========================== tanh register ============================= */ +REGISTER_OPERATOR( + tanh, ops::ActivationOp, ops::TanhOpMaker, ops::ActivationOpInferVarType, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::framework::OpDesc>, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::imperative::OpBase>, + std::conditional>(), + ops::ActFwdInplaceInferer, void>::type); +REGISTER_OPERATOR(tanh_grad, ops::ActivationOpGrad, + ops::ActivationGradOpInplaceInferer, + ops::TanhDoubleGradMaker, + ops::TanhDoubleGradMaker) +REGISTER_OPERATOR( + tanh_grad_grad, + ops::ActivationOpDoubleGrad::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInferer); + +REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); +REGISTER_OP_CPU_KERNEL( + tanh_grad_grad, ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>); +/* ========================================================================== */ + /* ========================== relu register ============================= */ REGISTER_OPERATOR( relu, ops::ActivationOp, ops::ReluOpMaker, ops::ActivationOpInferVarType, diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 04f329088fafe..781a97c1ffcc1 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -468,6 +468,19 @@ REGISTER_OP_CUDA_KERNEL( ops::ReluGradGradFunctor>); /* ========================================================================== */ +/* =========================== tanh register ============================ */ +REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); + +REGISTER_OP_CUDA_KERNEL( + tanh_grad_grad, + ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>); +/* ========================================================================== */ + /* =========================== sqrt register ============================= */ REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index fb5c4db91ec20..7245dea9cf949 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -258,6 +258,31 @@ struct 
SigmoidGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +// silu(x) = x / (1 + exp(-x)) +template +struct SiluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = static_cast(1) / (static_cast(1) + (-x).exp()); + out.device(d) = x * temp; + } +}; + +// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})) +template +struct SiluGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(1) + (-x).exp(); // 1+e^(-x) + auto temp2 = x * (-x).exp(); // x*e^(-x) + dx.device(d) = dout * ((static_cast(1) / temp1) * + (static_cast(1) + (temp2 / temp1))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + // Originally: logsigmoid(x) = -log (1 + exp(-x)) // For numerical stability, we can use the log-sum-exp trick: // https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ @@ -366,6 +391,36 @@ struct TanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +template +struct TanhGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* Out, + const framework::Tensor* ddX, const framework::Tensor* dOut, + framework::Tensor* dOutNew, framework::Tensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad")); + auto out = framework::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad")); + // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out + // * ddx) + if (dOutNew) { + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); + auto dout_new = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SquareGradGrad")); + dout_new.device(*d) = + static_cast(-1) * dout * static_cast(2) * out * ddx; + } + if (ddOut) { + auto ddout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SquareGradGrad")); + ddout.device(*d) = (static_cast(1) - out * out) * ddx; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + // tanhshrink(x) = x - tanh(x) // where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template @@ -1734,6 +1789,58 @@ inline void ExtractDoubleGradTensorWithInputDOut( } } +template +class TanhDoubleGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor *Out, *ddX, *dOut; + framework::Tensor *dOutNew, *ddOut; + Out = ddX = dOut = nullptr; + dOutNew = ddOut = nullptr; + + // extract ddx(input) and out(input) + auto ddx_var = ctx.InputVar("DDX"); + auto out_var = ctx.InputVar("Out"); + PADDLE_ENFORCE_NOT_NULL( + ddx_var, platform::errors::NotFound( + "Cannot get input Variable ddx, variable name = %s", + ctx.InputName("DDX"))); + PADDLE_ENFORCE_NOT_NULL( + out_var, platform::errors::NotFound( + "Cannot get input Variable out, variable name = %s", + ctx.InputName("Out"))); + ddX = ctx.Input("DDX"); + Out = ctx.Input("Out"); + + // set output ddout + auto ddout_var = ctx.OutputVar("DDOut"); + if (ddout_var) { + ddOut = ctx.Output("DDOut"); + } + + // extract dOut(intput) + auto dout_var = ctx.InputVar("DOut"); + PADDLE_ENFORCE_NOT_NULL( 
+ dout_var, platform::errors::NotFound( + "Cannot get input Variable dout_var, variable name = %s", + ctx.InputName("DOut"))); + dOut = ctx.Input("DOut"); + + // set output dout_new + auto dout_new_var = ctx.OutputVar("DOutNew"); + if (dout_new_var) { + dOutNew = ctx.Output("DOutNew"); + } + + if (dOutNew) dOutNew->mutable_data(Out->dims(), ctx.GetPlace()); + if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); + auto& place = ctx.template device_context(); + Functor functor; + functor(place, Out, ddX, dOut, dOutNew, ddOut); + } +}; template class SquareDoubleGradKernel : public framework::OpKernel { @@ -2047,8 +2154,8 @@ struct LogGradGradFunctor : public BaseActivationFunctor { #define FOR_EACH_ACTIVATION_OP(__macro) \ __macro(sigmoid, Sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ + __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(tanh, Tanh, TanhFunctor, TanhGradFunctor); \ __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc new file mode 100644 index 0000000000000..f368c65823055 --- /dev/null +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -0,0 +1,367 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. 
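For reference, the expressions implemented by SiluGradFunctor and TanhGradGradFunctor above follow directly from the definitions used in the DOC strings, with sigma(x) = 1/(1+e^{-x}) and y = tanh(x):

$$\frac{d}{dx}\bigl(x\,\sigma(x)\bigr) = \sigma(x) + x\,\sigma(x)\bigl(1-\sigma(x)\bigr) = \frac{1}{1+e^{-x}}\left(1 + \frac{x\,e^{-x}}{1+e^{-x}}\right)$$

which is the dout * (1/temp1) * (1 + temp2/temp1) expression in SiluGradFunctor, with temp1 = 1 + e^{-x} and temp2 = x e^{-x}. For tanh, the first backward pass computes dx = dout (1 - y^2); treating this as a function of y and dout, its partial derivatives are (1 - y^2) with respect to dout and -2 y dout with respect to y. Scaling each by the incoming gradient ddx gives the two outputs of tanh_grad_grad, DDOut = (1 - y^2) ddx and DOutNew = -2 y dout ddx, matching the comment inside TanhGradGradFunctor.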
*/ + +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class PowNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto factor = ctx.Attr("factor"); + + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Power", {*x}, {*out}, + {{"power", factor}, + {"scale", static_cast(1.0)}, + {"shift", static_cast(0.0)}}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class PowGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto factor = ctx.Attr("factor"); + + auto x_dims = x->dims(); + + auto place = ctx.GetPlace(); + auto stream = + ctx.template device_context() + .stream(); + + // NOTE(liym27): dx = dout * factor * x.pow(factor-1) + + // Step1: Compute x_pow = x.pow(factor-1) + Tensor x_pow(x->type()); + x_pow.mutable_data(x->dims(), place); + auto runner_pow = NpuOpRunner("Power", {*x}, {x_pow}, + {{"power", factor - static_cast(1)}}); + runner_pow.Run(stream); + + // Step 2: Construct a broadcast factor, which has the same shape with x. + + // 2.1 Get a factor tensor with shape [1]. + Tensor factor_tensor(framework::proto::VarType::FP32); + factor_tensor.mutable_data({1}, place); + FillNpuTensorWithConstant(&factor_tensor, factor); + + // 2.2 Get the factor which has the shape with x and the same value with + // factor. 
+ Tensor factor_bc_tensor(framework::proto::VarType::FP32); + factor_bc_tensor.mutable_data(x_dims, place); + auto runner_bc = NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor}, + {{"dims", framework::vectorize(x_dims)}}); + runner_bc.Run(stream); + + // Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1) + Tensor x_power_mul_factor(x->type()); + x_power_mul_factor.mutable_data(x->dims(), place); + auto runner_mul_1 = + NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {}); + runner_mul_1.Run(stream); + + // Step 4: Compute dx = dout * factor * x.pow(factor-1) + dx->mutable_data(place); + auto runner_mul_2 = + NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {}); + runner_mul_2.Run(stream); + } +}; + +template +class ReluNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Relu", + { + *x, + }, + {*out}, {}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class ReluGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto stream = + ctx.template device_context() + .stream(); + + dx->mutable_data(ctx.GetPlace()); + auto runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {}); + + runner.Run(stream); + } +}; + +template +class SqrtNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Sqrt", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class SqrtGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto dx_runner = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {}); + dx_runner.Run(stream); + } +}; + +template +class LogNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + Tensor one(x->type()); + one.mutable_data(x->dims(), place); + auto one_runner = NpuOpRunner("OnesLike", {*x}, {one}, {}); + one_runner.Run(stream); + + Tensor sub(x->type()); + sub.mutable_data(x->dims(), place); + auto sub_runner = NpuOpRunner("Sub", {*x, one}, {sub}, {}); + sub_runner.Run(stream); + + auto out_runner = NpuOpRunner("Log1p", {sub}, {*out}, {}); + out_runner.Run(stream); + } +}; + +template +class LogGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = 
ctx.Input("X"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + auto runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {}); + runner.Run(stream); + } +}; + +template +class TanhNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Tanh", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class TanhGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = ctx.Input("Out"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto dx_runner = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {}); + dx_runner.Run(stream); + } +}; + +template +class SquareNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Square", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + pow, ops::PowNPUKernel, + ops::PowNPUKernel); + +REGISTER_OP_NPU_KERNEL( + pow_grad, ops::PowGradNPUKernel, + ops::PowGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + relu, ops::ReluNPUKernel, + ops::ReluNPUKernel); + +REGISTER_OP_NPU_KERNEL( + relu_grad, + ops::ReluGradNPUKernel, + ops::ReluGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + sqrt, ops::SqrtNPUKernel, + ops::SqrtNPUKernel); + +REGISTER_OP_NPU_KERNEL( + sqrt_grad, + ops::SqrtGradNPUKernel, + ops::SqrtGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + log, ops::LogNPUKernel, + ops::LogNPUKernel); + +REGISTER_OP_NPU_KERNEL( + log_grad, ops::LogGradNPUKernel, + ops::LogGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + tanh, ops::TanhNPUKernel, + ops::TanhNPUKernel); + +REGISTER_OP_NPU_KERNEL( + tanh_grad, + ops::TanhGradNPUKernel, + ops::TanhGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + square, ops::SquareNPUKernel, + ops::SquareNPUKernel, + ops::SquareNPUKernel); diff --git a/paddle/fluid/operators/amp/CMakeLists.txt b/paddle/fluid/operators/amp/CMakeLists.txt index b3ff52a7ae119..2ea8bbcbc61df 100644 --- a/paddle/fluid/operators/amp/CMakeLists.txt +++ b/paddle/fluid/operators/amp/CMakeLists.txt @@ -4,3 +4,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() register_operators() + +if(WITH_ASCEND_CL) + cc_test(check_finite_and_unscale_op_npu_test SRCS check_finite_and_unscale_op_npu_test.cc DEPS op_registry check_finite_and_unscale_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/amp/alloc_float_status_op.cc b/paddle/fluid/operators/amp/alloc_float_status_op.cc new file mode 100644 index 0000000000000..181dd6eabe22d --- /dev/null +++ b/paddle/fluid/operators/amp/alloc_float_status_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class AllocFloatStatusOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasOutput("FloatStatus"), "Output", "FloatStatus", + "alloc_float_status"); + ctx->SetOutputDim("FloatStatus", {8}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); + } +}; + +class AllocFloatStatusMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("FloatStatus", + "(Tensor) of shape {8} that holds the float status."); + AddComment(R"DOC( + Produces a float Tensor that holds the float status +)DOC"); + } +}; + +template +class AllocFloatStatusKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Operator alloc_float_status is not supported on CPU")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR( + alloc_float_status, ops::AllocFloatStatusOp, ops::AllocFloatStatusMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(alloc_float_status, + ops::AllocFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc new file mode 100644 index 0000000000000..fe5b08af52a62 --- /dev/null +++ b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class AllocFloatStatusKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* float_status = ctx.Output("FloatStatus"); + float_status->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + alloc_float_status, + ops::AllocFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc index 9d78936ad5f7f..c7520dbd34f6a 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc @@ -60,6 +60,12 @@ class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Scale", "(Tensor) 1-dim tensor, the scale of check_finite_and_unscale " "operator."); +#ifdef PADDLE_WITH_ASCEND_CL + AddInput("FloatStatus", + "(Tensor) 1-dim tensor of shape [8], allocated by " + "alloc_float_status op") + .AsDispensable(); +#endif AddOutput("Out", "(Tensors) The scaled output tensor of " "check_finite_and_unscale operator.") diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index 6840e4847c4c6..2c3a9c366e4fd 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -26,18 +26,48 @@ __global__ void InverseAndMemset(const T* s, T* o, bool* found_inf) { } template -__global__ void CheckFiniteAndUnscale(const T* in, const MT* scale, int num, - bool* found_inf, T* out) { - const int idx = threadIdx.x + blockIdx.x * blockDim.x; - - if (idx < num) { - MT val = static_cast(in[idx]) * (*scale); +__global__ void CheckFiniteAndUnscale(const T** xs, const MT* scale, + int64_t size, int64_t* starts, + bool* found_inf, T** outs) { + const int64_t tid = threadIdx.x + blockIdx.x * blockDim.x; + + // copy starts array from global memory to shared memory + extern __shared__ int64_t s_starts[]; + for (int i = threadIdx.x; i <= size; i += blockDim.x) { + s_starts[i] = starts[i]; + } + __syncthreads(); + + const int64_t num = s_starts[size]; + int pre_xs_index = 0; + bool t_found_inf = false; + const MT t_scale = *scale; + for (int64_t idx = tid; idx < num; idx += gridDim.x * blockDim.x) { + // get the xs's index of thread + int xs_index = pre_xs_index; + while (idx < s_starts[xs_index]) xs_index++; + // avoid some tensor's numel is zero + while (idx >= s_starts[xs_index]) xs_index++; + pre_xs_index = xs_index - 1; + + // get in data and out data + const T* in = xs[pre_xs_index]; + T* out = outs[pre_xs_index]; + int64_t in_idx = idx - s_starts[pre_xs_index]; + + // Unscale + MT val = static_cast(in[in_idx]) * t_scale; T narrow_val = static_cast(val); - out[idx] = narrow_val; + out[in_idx] = narrow_val; + + // CheckFinite if (!isfinite(narrow_val)) { - *found_inf = true; + t_found_inf = true; } } + if (t_found_inf) { + *found_inf = true; + } } template @@ -63,20 +93,53 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { 
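// Note on the fused indexing used by the CheckFiniteAndUnscale kernel above and by the
// host code below (illustrative values only, not taken from the patch): with three input
// tensors of numel 3, 0 and 2, starts = {0, 3, 3, 5} and the total element count is
// starts[3] = 5. A flattened thread index idx = 4 satisfies starts[2] <= idx < starts[3],
// so it belongs to the third tensor (index 2) at offset idx - starts[2] = 1; the empty
// tensor is skipped because consecutive entries of starts are equal there.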
InverseAndMemset<<<1, 1, 0, dev_ctx.stream()>>>( scale_data, inverse_scale_v, found_inf_data); - for (size_t i = 0; i < xs.size(); ++i) { - const auto* x = xs[i]; - auto* out = outs[i]; - const T* x_data = x->data(); - T* out_data = out->mutable_data(dev_ctx.GetPlace()); - - int num = x->numel(); - int block = 1024; - int grid = (num + block - 1) / block; - VLOG(3) << "launch kernel"; - CheckFiniteAndUnscale<<>>( - x_data, inverse_scale_v, num, found_inf_data, out_data); - VLOG(3) << "finish kernel"; + size_t xs_size = xs.size(); + // calculate each tensor's start index and copy to device + auto h_starts_tensor = + memory::Alloc(platform::CPUPlace(), (xs_size + 1) * sizeof(int64_t)); + int64_t* h_starts = reinterpret_cast(h_starts_tensor->ptr()); + + auto d_starts_tensor = + memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + int64_t* d_starts = reinterpret_cast(d_starts_tensor->ptr()); + + h_starts[0] = 0; + for (int i = 1; i <= xs_size; i++) { + // the start index value of each tensor is + // the sum of previous tensor's size + h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); + } + int64_t total_num = h_starts[xs_size]; + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_starts, platform::CPUPlace(), h_starts, + (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()); + + // copy each tensor's data address to device + auto h_mem = memory::Alloc(platform::CPUPlace(), 2 * xs_size * sizeof(T*)); + const T** h_xs = reinterpret_cast(h_mem->ptr()); + T** h_outs = reinterpret_cast(h_mem->ptr()) + xs_size; + + auto d_mem = memory::Alloc(dev_ctx, 2 * xs_size * sizeof(T*)); + const T** d_xs = reinterpret_cast(d_mem->ptr()); + T** d_outs = reinterpret_cast(d_mem->ptr()) + xs_size; + + for (size_t i = 0; i < xs_size; ++i) { + h_xs[i] = xs[i]->data(); + h_outs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), d_xs, + platform::CPUPlace(), h_xs, 2 * xs_size * sizeof(T*), + dev_ctx.stream()); + + // Launch Kernel + int block = 1024; + int block_num = block * 20; // each thread deal with 20 number + int grid = (total_num + block_num - 1) / block_num; + VLOG(3) << "launch kernel"; + CheckFiniteAndUnscale<<< + grid, block, (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()>>>( + d_xs, inverse_scale_v, xs_size, d_starts, found_inf_data, d_outs); + VLOG(3) << "finish kernel"; } }; } // namespace operators diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc new file mode 100644 index 0000000000000..8fd45326e4ec6 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +// NOTE(zhiqiu): The CheckFiniteAndUnscaleNPUKernel is different from CUDA. +// On NPU, we do not really check the data of input tensors, +// but use NPUGetFloatStatus to check whether nan/inf occurs on the device, +// and clear it after this op. +// This may lead to wrong results if the input tensors are not computed +// on the NPU device but come from somewhere else, for example, feeding. +template +class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + const auto* float_status = ctx.Input("FloatStatus"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); + + found_inf->mutable_data(ctx.GetPlace()); + + bool found_inf_data = false; + + auto stream = + ctx.template device_context() + .stream(); + + // step 1: compute the inverse scale (1.0 / scale) + Tensor const_tensor; + const_tensor.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&const_tensor, static_cast(1.0)); + + // Inverse(1.0/scale) + Tensor* tmp_inverse_out = const_cast(scale); + Tensor inverse_out(scale->type()); + inverse_out.Resize(scale->dims()); + inverse_out.mutable_data(ctx.GetPlace()); + auto runner_inverse = + NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {}); + runner_inverse.Run(stream); + tmp_inverse_out = &inverse_out; + + // NOTE(zhiqiu): + Tensor tmp; + tmp.mutable_data({8}, ctx.GetPlace()); + + // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place. + // tmp is only placeholder.
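    // Sketch of the float-status flow this kernel relies on (names follow the runners
    // used below; the exact device-side semantics are assumed, not verified):
    //   float_status = alloc_float_status()       -> {8} tensor, allocated once
    //   ... fp16/fp32 compute runs on the NPU ...
    //   NPUGetFloatStatus(float_status)           -> overflow flags accumulate in float_status
    //   sum = ReduceSumD(float_status); found_inf = (sum > 1)
    //   NPUClearFloatStatus(float_status)         -> reset before the next step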
+ auto runner_float_status = + NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp}, + {{"message", std::string("check_nan_and_inf")}}); + runner_float_status.Run(stream); + + Tensor sum; + sum.mutable_data({1}, ctx.GetPlace()); + auto runner_reduce_sum = + NpuOpRunner("ReduceSumD", {*float_status}, {sum}, + {{"axes", std::vector{0}}, {"keep_dims", true}}); + runner_reduce_sum.Run(stream); + + std::vector sum_vec; + TensorToVector( + sum, ctx.template device_context(), + &sum_vec); + found_inf_data = (sum_vec[0] > 1); + + VLOG(4) << "found_inf_data:" << found_inf_data; + + for (size_t i = 0; i < xs.size(); ++i) { + const auto* x = xs[i]; + auto* out = outs[i]; + out->mutable_data(ctx.GetPlace()); + if (!found_inf_data) { + // unscale: out = x * (1.0 / scale), element-wise Mul + auto runner_matmul = + NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); + runner_matmul.Run(stream); + } + } + + // copy found_inf_data into the FoundInfinite output tensor + VLOG(4) << "found overflow:" << found_inf_data; + Tensor found_inf_tensor; + found_inf_tensor.Resize({1}); + bool* is_found_inf = + found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); + *is_found_inf = found_inf_data; + + framework::TensorCopy( + found_inf_tensor, ctx.GetPlace(), + ctx.template device_context(), found_inf); + ctx.template device_context().Wait(); + + auto runner_clear_status = + NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp}); + runner_clear_status.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_NPU_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleNPUKernel, + ops::CheckFiniteAndUnscaleNPUKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc new file mode 100644 index 0000000000000..a80b83f0cbe51 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/enforce.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +using Tensor = paddle::framework::Tensor; + +USE_OP(check_finite_and_unscale); +USE_OP_DEVICE_KERNEL(check_finite_and_unscale, NPU); + +struct InputVars { + std::string name; + f::LoDTensor *tensor; +}; + +template +void Compare(f::Scope *scope, const p::DeviceContext &ctx) { + const f::DDim dims = f::make_ddim({2, 2}); + auto place = ctx.GetPlace(); + + // init input + std::vector input_names = { + {"x", scope->Var("x")->GetMutable()}, + {"x1", scope->Var("x1")->GetMutable()}}; + + auto *scale = scope->Var("scale")->GetMutable(); + + // init output + auto *out = scope->Var("out")->GetMutable(); + auto *out1 = scope->Var("out1")->GetMutable(); + auto *found_inf = scope->Var("found_inf")->GetMutable(); + + // Initialize input data + const int num_inputs = input_names.size(); + size_t numel = static_cast(f::product(dims)); + + for (int i = 0; i < num_inputs; ++i) { + std::vector init_xs; + for (size_t j = 0; j < numel; ++j) { + if (j == 0) { + init_xs.push_back(static_cast(NAN)); + } else { + init_xs.push_back(static_cast(j + 1)); + } + } + f::TensorFromVector(init_xs, ctx, input_names[i].tensor); + input_names[i].tensor->Resize(dims); + } + + f::TensorFromVector(std::vector{static_cast(0.5)}, ctx, scale); + + ctx.Wait(); + + // run + f::AttributeMap attrs; + auto op = f::OpRegistry::CreateOp( + "check_finite_and_unscale", {{"X", {"x", "x1"}}, {"Scale", {"scale"}}}, + {{"Out", {"out", "out1"}}, {"FoundInfinite", {"found_inf"}}}, attrs); + op->Run(*scope, place); + ctx.Wait(); + + // out0 + std::vector out_vec; + f::TensorToVector(*out, ctx, &out_vec); + EXPECT_EQ(out_vec.size(), static_cast(4)); + for (size_t j = 0; j < out_vec.size(); ++j) { + VLOG(3) << "out_vec[" << j << "]:" << out_vec[j]; + } + + ctx.Wait(); + + // out0 + std::vector out1_vec; + f::TensorToVector(*out1, ctx, &out1_vec); + EXPECT_EQ(out1_vec.size(), static_cast(4)); + for (size_t j = 0; j < out1_vec.size(); ++j) { + VLOG(3) << "out1_vec[" << j << "]:" << out1_vec[j]; + } + + ctx.Wait(); + + // out found_inf + Tensor found_inf_tensor; + found_inf_tensor.Resize({1}); + bool *found_inf_data = + found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); + f::TensorCopy(*found_inf, place, &found_inf_tensor); + EXPECT_TRUE(*found_inf_data); + + ctx.Wait(); +} + +TEST(check_finite_and_unscale, NPU_fp32) { + f::Scope scope; + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); +} + +TEST(check_finite_and_unscale, NPU_fp16) { + f::Scope scope; + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); +} diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc new file mode 100644 index 0000000000000..45b28bf61e5d6 --- /dev/null +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -0,0 +1,219 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +void Update(const platform::NPUDeviceContext& ctx, + const std::vector found_inf_vec, + const Tensor* pre_loss_scaling_tensor, const Tensor* good_in_tensor, + const Tensor* bad_in_tensor, const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, Tensor* updated_loss_scaling_tensor, + Tensor* good_out_tensor, Tensor* bad_out_tensor) { + auto place = ctx.GetPlace(); + auto stream = ctx.stream(); + if (found_inf_vec[0]) { + // good_out_data = 0 + auto g = good_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + good_out_tensor->numel() * sizeof(int), stream); + // bad_out_data = bad_in_data + 1 + Tensor factor_tensor(bad_out_tensor->type()); + factor_tensor.mutable_data({1}, place); + FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); + auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor}, + {*bad_out_tensor}, {}); + runner_p2.Run(stream); + + std::vector bad_out_data; + TensorToVector(*bad_out_tensor, ctx, &bad_out_data); + if (bad_out_data[0] == decr_every_n_nan_or_inf) { + auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", decr_ratio}, + {"shift", static_cast(0)}}); + + runner_p3.Run(stream); + + std::vector new_loss_scaling; + TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); + if (new_loss_scaling[0] < static_cast(1)) { + // updated_loss_scaling_data = 1 + auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(0)}, + {"shift", static_cast(1)}}); + + runner_p4.Run(stream); + } + + // bad_out_data = 0 + auto b = bad_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(b), 0, + bad_out_tensor->numel() * sizeof(int), stream); + } + } else { + // bad_out_data = 0 + auto b = bad_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(b), 0, + bad_out_tensor->numel() * sizeof(int), stream); + + // good_out_data = good_in_data + 1 + Tensor factor_tensor(good_out_tensor->type()); + factor_tensor.mutable_data({1}, place); + FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); + auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor}, + {*good_out_tensor}, {}); + runner_p2.Run(stream); + + std::vector good_out_data; + TensorToVector(*good_out_tensor, ctx, &good_out_data); + + if (good_out_data[0] == incr_every_n_steps) { + auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", incr_ratio}, + {"shift", static_cast(0)}}); + runner_p3.Run(stream); + + std::vector new_loss_scaling; + TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); + if 
(!std::isfinite(new_loss_scaling[0])) { + // updated_loss_scaling_data = pre_loss_scaling_data + auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(1)}, + {"shift", static_cast(0)}}); + + runner_p4.Run(stream); + } + // good_out_data = 0 + auto g = good_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + good_out_tensor->numel() * sizeof(int), stream); + } + } +} + +template +class UpdateLossScalingFunctor { + public: + void operator()(const platform::NPUDeviceContext& dev_ctx, + const std::vector found_inf_vec, + const Tensor* pre_loss_scaling_tensor, + const Tensor* good_in_tensor, const Tensor* bad_in_tensor, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, Tensor* updated_loss_scaling_tensor, + Tensor* good_out_tensor, Tensor* bad_out_tensor) const { + Update(dev_ctx, found_inf_vec, pre_loss_scaling_tensor, good_in_tensor, + bad_in_tensor, incr_every_n_steps, decr_every_n_nan_or_inf, + incr_ratio, decr_ratio, updated_loss_scaling_tensor, + good_out_tensor, bad_out_tensor); + } +}; + +template +class LazyZerosNPU { + public: + void operator()(const platform::NPUDeviceContext& dev_ctx, + const std::vector found_inf_vec, + const std::vector& xs, + const std::vector& outs) const { + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + if (found_inf_vec[0]) { + VLOG(4) << "-- UpdateLossScaling: Find infinite grads. --"; + + auto place = dev_ctx.GetPlace(); + auto stream = dev_ctx.stream(); + auto g = out->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + out->numel() * sizeof(T), stream); + } + } + } +}; + +template +class UpdateLossScalingNPUKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const auto xs = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); + const auto* found_inf = ctx.Input("FoundInfinite"); + PADDLE_ENFORCE_EQ(found_inf->numel(), 1, + platform::errors::InvalidArgument( + "FoundInfinite must has only one element.")); + + std::vector found_inf_vec; + TensorToVector(*found_inf, ctx.device_context(), &found_inf_vec); + + LazyZerosNPU{}(dev_ctx, found_inf_vec, xs, outs); + const bool stop_update = ctx.Attr("stop_update"); + if (stop_update) { + return; + } + + const auto* pre_loss_scaling = ctx.Input("PrevLossScaling"); + const auto* good_in = ctx.Input("InGoodSteps"); + const auto* bad_in = ctx.Input("InBadSteps"); + auto* updated_loss_scaling = ctx.Output("LossScaling"); + auto* good_out = ctx.Output("OutGoodSteps"); + auto* bad_out = ctx.Output("OutBadSteps"); + + updated_loss_scaling->mutable_data(dev_ctx.GetPlace()); + good_out->mutable_data(dev_ctx.GetPlace()); + bad_out->mutable_data(dev_ctx.GetPlace()); + + const int incr_every_n_steps = ctx.Attr("incr_every_n_steps"); + const int decr_every_n_nan_or_inf = + ctx.Attr("decr_every_n_nan_or_inf"); + const float incr_ratio = ctx.Attr("incr_ratio"); + const float decr_ratio = ctx.Attr("decr_ratio"); + UpdateLossScalingFunctor{}( + dev_ctx, found_inf_vec, pre_loss_scaling, good_in, bad_in, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling, good_out, bad_out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + 
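// Summary of the dynamic loss-scaling rule implemented by Update() above (pseudo code
// over the kernel attributes; behavior read off the runners used, so treat it as a sketch):
//   if found_inf:
//     good = 0; bad = bad_in + 1
//     if bad == decr_every_n_nan_or_inf:
//       scale = prev_scale * decr_ratio   // reset to 1 if the result drops below 1
//       bad = 0
//   else:
//     bad = 0; good = good_in + 1
//     if good == incr_every_n_steps:
//       scale = prev_scale * incr_ratio   // keep prev_scale if the result is not finite
//       good = 0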
+REGISTER_OP_NPU_KERNEL( + update_loss_scaling, + ops::UpdateLossScalingNPUKernel, + ops::UpdateLossScalingNPUKernel); diff --git a/paddle/fluid/operators/assign_op_npu.cc b/paddle/fluid/operators/assign_op_npu.cc new file mode 100644 index 0000000000000..93689d5e495f3 --- /dev/null +++ b/paddle/fluid/operators/assign_op_npu.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/assign_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace framework { +class OpDesc; +class Variable; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace paddle { +namespace operators { +template +class AssignNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + assign, ops::AssignNPUKernel, + ops::AssignNPUKernel, + ops::AssignNPUKernel) diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc new file mode 100644 index 0000000000000..792d01a5efe43 --- /dev/null +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(assign); +USE_OP_DEVICE_KERNEL(assign, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + std::vector init; + init.push_back(static_cast(1.0)); + init.push_back(static_cast(2.0)); + init.push_back(static_cast(3.0)); + init.push_back(static_cast(4.0)); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({4}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + auto op = + f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}}, {{"Out", {"Out"}}}, {}); + + op->Run(*scope, place); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + ctx.Wait(); + + EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)4); + EXPECT_EQ(out_vec[0], static_cast(1.0)); + EXPECT_EQ(out_vec[1], static_cast(2.0)); + EXPECT_EQ(out_vec[2], static_cast(3.0)); + EXPECT_EQ(out_vec[3], static_cast(4.0)); +} + +TEST(assign, NPU_fp32) { + f::Scope scope; + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "assign"); +} diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 444c24b826b1b..41dc87ac1ba47 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -41,6 +41,83 @@ using CudnnDataType = platform::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; +template +static __global__ void BNForwardInference( + const T *x, const BatchNormParamType *mean, + const BatchNormParamType *variance, const BatchNormParamType *scale, + const BatchNormParamType *bias, const int C, const int N, const int HxW, + const double epsilon, T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int num = N * C * HxW; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? 
i / HxW % C : i % C; + BatchNormParamType x_sub_mean = + static_cast>(x[i]) - mean[c]; + BatchNormParamType inv_var = 1 / sqrt(variance[c] + epsilon); + y[i] = static_cast(scale[c] * x_sub_mean * inv_var + bias[c]); + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( + const T *x, const BatchNormParamType *scale, + const BatchNormParamType *bias, const int C, const int N, const int HxW, + const double epsilon, double exponentialAverageFactor, T *y, + BatchNormParamType *mean, BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + int outer_size = C; + int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType variance_val; + __shared__ BatchNormParamType inv_var_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + variance_val = x_square_sum / inner_size - mean_val * mean_val; + inv_var_val = 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val; + save_inv_variance[i] = inv_var_val; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * variance[i]; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; + } + } +} + template class BatchNormKernel : public framework::OpKernel { @@ -80,8 +157,12 @@ class BatchNormKernel auto dtype = platform::CudnnDataType::type; #ifdef PADDLE_WITH_HIP - // HIP do not support compute format of NHWC - auto compute_format = DataLayout::kNCHW; + auto compute_format = data_layout == DataLayout::kNHWC ? 
DataLayout::kNHWC + : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; #else const bool fast_nhwc_batch_norm = test_mode || @@ -111,14 +192,15 @@ class BatchNormKernel // ------------------- cudnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t bn_param_desc_; - miopenBatchNormMode_t mode_; - - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -138,7 +220,8 @@ class BatchNormKernel epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - mode_ = miopenBNSpatial; +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; @@ -161,14 +244,15 @@ class BatchNormKernel } #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - // Note: PERSISTENT not implemented for inference - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// Note: PERSISTENT not implemented for inference +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor( +// bn_param_desc_, data_desc_, test_mode ? 
miopenBNSpatial : mode_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, @@ -226,28 +310,53 @@ class BatchNormKernel C, est_var->dims()[0], est_var->dims())); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardInference( - handle, miopenBNSpatial, - const_cast( - static_cast(CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, - static_cast(transformed_x.template data()), - data_desc_, - static_cast( - transformed_y.template mutable_data(ctx.GetPlace())), - bn_param_desc_, - const_cast(static_cast( - scale->template data>())), - const_cast(static_cast( - bias->template data>())), - const_cast(static_cast( - est_mean->template data>())), - const_cast(static_cast( - est_var->template data>())), - epsilon)); + const int block_size = 256; + const int grid_size = (N * C * H * W * D + block_size - 1) / block_size; + if (compute_format == DataLayout::kNCHW) { + BNForwardInference< + T, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, transformed_y.template data()); + } else { + BNForwardInference< + T, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, transformed_y.template data()); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardInference( +// handle, miopenBNSpatial, +// const_cast( +// static_cast(CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// const_cast(static_cast( +// est_mean->template data>())), +// const_cast(static_cast( +// est_var->template data>())), +// epsilon)); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardInference( @@ -365,34 +474,66 @@ class BatchNormKernel #endif // CUDNN_VERSION_MIN(7, 4, 1) if (!called) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardTraining( - handle, mode_, const_cast(static_cast( - CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, - static_cast(transformed_x.template data()), - data_desc_, - static_cast( - transformed_y.template mutable_data(ctx.GetPlace())), - bn_param_desc_, - const_cast(static_cast( - scale->template data>())), - const_cast(static_cast( - bias->template data>())), - this_factor, - static_cast( - mean_out->template mutable_data>( - ctx.GetPlace())), - static_cast(variance_out->template mutable_data< - BatchNormParamType>(ctx.GetPlace())), - epsilon, - static_cast( - saved_mean->template mutable_data>( - ctx.GetPlace())), - static_cast(saved_variance->template mutable_data< - BatchNormParamType>(ctx.GetPlace())))); + const int num = transformed_x.numel(); + const int block = 256; + const int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + 
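          // Launch-config note (inferred from BNForwardTraining above): each block
          // reduces one channel's N * H * W * D elements with cub::BlockReduce, so
          // the grid only needs up to C blocks.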
const int grid = std::min(C, max_blocks); + if (compute_format == DataLayout::kNCHW) { + BNForwardTraining< + T, block, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, this_factor, transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } else { + BNForwardTraining< + T, block, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, this_factor, transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardTraining( +// handle, mode_, const_cast(static_cast( +// CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// this_factor, +// static_cast( +// mean_out->template mutable_data>( +// ctx.GetPlace())), +// static_cast(variance_out->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())), +// epsilon, +// static_cast( +// saved_mean->template mutable_data>( +// ctx.GetPlace())), +// static_cast(saved_variance->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())))); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTraining( @@ -423,11 +564,12 @@ class BatchNormKernel ctx, &transformed_y, y); } #ifdef PADDLE_WITH_HIP - // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. 
PADDLE_ENFORCE_CUDA_SUCCESS( @@ -439,7 +581,7 @@ class BatchNormKernel }; template -static __global__ void KeBNBackwardScaleBias( +static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias( const T *dy, const T *x, const BatchNormParamType *mean, const BatchNormParamType *variance, const double epsilon, const int N, const int C, const int HxW, BatchNormParamType *dscale, @@ -526,13 +668,97 @@ class InplaceHelper { }; template -static __global__ void BNBackwardData(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *mean, - const T *x, - const BatchNormParamType *variance, - const int C, const int N, const int HxW, - T *dx) { +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward( + const T *dy, const T *x, const BatchNormParamType *scale, + const BatchNormParamType *saved_mean, + const BatchNormParamType *saved_inv_variance, const int C, const int N, + const int HxW, const double epsilon, T *dx, BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType inv_var_val; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType dscale_val; + __shared__ BatchNormParamType dbias_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = static_cast>(0); + BatchNormParamType db_sum = static_cast>(0); + + if (saved_mean && saved_inv_variance) { + if (threadIdx.x == 0) { + inv_var_val = saved_inv_variance[i]; + mean_val = saved_mean[i]; + } + } else { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = + static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = + static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + inv_var_val = + 1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon); + } + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType dy_i = + static_cast>(dy[index]); + ds_sum += + dy_i * (static_cast>(x[index]) - mean_val); + db_sum += dy_i; + } + ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); + db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + if (threadIdx.x == 0) { + dscale_val = ds_sum * inv_var_val; + dbias_val = db_sum; + dscale[i] = dscale_val; + dbias[i] = dbias_val; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + dx[index] = scale[i] * inv_var_val * + (static_cast>(dy[index]) - + dbias_val / static_cast>(inner_size) - + (static_cast>(x[index]) - mean_val) * + inv_var_val * dscale_val / inner_size); + } + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData( + const T *dy, const BatchNormParamType *scale, + const BatchNormParamType *mean, const T *x, + const BatchNormParamType *variance, const int C, const int N, + const int HxW, T *dx) { const int outer_size = C; const int inner_size = N * HxW; typedef cub::BlockReduce, BlockDim> BlockReduce; @@ -567,7 +793,6 @@ static __global__ void BNBackwardData(const T *dy, dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; } __syncthreads(); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { const int index = layout == framework::DataLayout::kNCHW ? (j / HxW * C + i) * HxW + j % HxW @@ -668,8 +893,12 @@ class BatchNormGradKernel auto dtype = platform::CudnnDataType::type; const auto *reserve_space = ctx.Input("ReserveSpace"); #ifdef PADDLE_WITH_HIP - // HIP do not support compute format of NHWC - auto compute_format = DataLayout::kNCHW; + auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC + : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; #else const bool fast_nhwc_batch_norm = dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent && @@ -714,7 +943,11 @@ class BatchNormGradKernel auto &dev_ctx = ctx.template device_context(); const int num = transformed_x.numel(); +#ifdef HIPCC + const int block = 256; +#else const int block = 512; +#endif int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid1 = (num + block - 1) / block; @@ -734,14 +967,15 @@ class BatchNormGradKernel // ------------------- cudnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t bn_param_desc_; - miopenBatchNormMode_t mode_; - - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -759,7 +993,8 @@ class BatchNormGradKernel } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - mode_ = miopenBNSpatial; +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; @@ -771,13 +1006,14 @@ class BatchNormGradKernel #endif // CUDNN_VERSION_MIN(7, 0, 1) #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? 
x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, +// data_desc_, mode_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, @@ -871,20 +1107,49 @@ class BatchNormGradKernel #endif // CUDNN_VERSION_MIN(7, 4, 1) if (!called) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationBackward( - dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_d_y.template data(), data_desc_, - transformed_d_x.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); + if (compute_format == DataLayout::kNCHW) { + BNBackward< + T, block, + DataLayout::kNCHW><<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale->template data>(), saved_mean_data, + saved_var_data, C, N, H * W * D, epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } else { + BNBackward< + T, block, + DataLayout::kNHWC><<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale->template data>(), saved_mean_data, + saved_var_data, C, N, H * W * D, epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationBackward( +// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), +// CudnnDataType::kZero(), CudnnDataType::kOne(), +// CudnnDataType::kZero(), data_desc_, +// transformed_x.template data(), data_desc_, +// transformed_d_y.template data(), data_desc_, +// transformed_d_x.template mutable_data(ctx.GetPlace()), +// bn_param_desc_, scale->template data>(), +// d_scale->template mutable_data>( +// ctx.GetPlace()), +// d_bias->template mutable_data>( +// ctx.GetPlace()), +// epsilon, saved_mean_data, saved_var_data)); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationBackward( @@ -931,11 +1196,12 @@ class BatchNormGradKernel } #ifdef PADDLE_WITH_HIP - // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. 
+// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. PADDLE_ENFORCE_CUDA_SUCCESS( diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu index 99153101fc326..8bd2b7fe2d127 100644 --- a/paddle/fluid/operators/bce_loss_op.cu +++ b/paddle/fluid/operators/bce_loss_op.cu @@ -32,6 +32,11 @@ __global__ void GPUBCELossForward(const T* x_data, const T* label_data, T one = static_cast(1.); T neg_100 = static_cast(-100.); + PADDLE_ENFORCE( + (x >= static_cast(0)) && (x <= one), + "Input is expected to be within the interval [0, 1], but recieved %f.", + x); + T term1 = max(real_log(x), neg_100); T term2 = max(real_log(one - x), neg_100); @@ -64,29 +69,13 @@ class BCELossCUDAKernel : public framework::OpKernel { auto* labels = ctx.Input("Label"); auto* out = ctx.Output("Out"); - auto x_data = x->data(); - auto out_data = out->mutable_data(ctx.GetPlace()); + const auto* x_data = x->data(); + auto* out_data = out->mutable_data(ctx.GetPlace()); auto x_numel = x->numel(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), x_numel); - - Tensor x_cpu; - framework::TensorCopy(*x, platform::CPUPlace(), &x_cpu); - T* x_cpu_data = x_cpu.data(); - - for (int64_t i = 0; i < x_numel; ++i) { - PADDLE_ENFORCE_GE( - x_cpu_data[i], static_cast(0), - platform::errors::InvalidArgument( - "Illegal input, input must be greater than or equal to 0")); - PADDLE_ENFORCE_LE( - x_cpu_data[i], static_cast(1), - platform::errors::InvalidArgument( - "Illegal input, input must be less than or equal to 1")); - } - auto& dev_ctx = ctx.cuda_device_context(); + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(dev_ctx, x_numel); GPUBCELossForward<<>>(x_data, labels->data(), @@ -102,9 +91,10 @@ class BCELossGradCUDAKernel : public framework::OpKernel { auto* labels = ctx.Input("Label"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); - auto dx_data = dx->mutable_data(ctx.GetPlace()); int x_numel = x->numel(); + auto* dx_data = dx->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.cuda_device_context(); platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(dev_ctx, x_numel); diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc new file mode 100644 index 0000000000000..0de0f5e450579 --- /dev/null +++ b/paddle/fluid/operators/cast_op_npu.cc @@ -0,0 +1,100 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/operators/cast_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +static std::map + DTYPE_2_ACL_DTYPE = { + {framework::proto::VarType::BOOL, ACL_BOOL}, + {framework::proto::VarType::INT16, ACL_INT16}, + {framework::proto::VarType::INT32, ACL_INT32}, + {framework::proto::VarType::INT64, ACL_INT64}, + {framework::proto::VarType::FP16, ACL_FLOAT16}, + {framework::proto::VarType::FP32, ACL_FLOAT}, + {framework::proto::VarType::FP64, ACL_DOUBLE}, +}; + +using Tensor = framework::Tensor; + +template +class CastNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + int dtype = ctx.Attr("out_dtype"); + auto* out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + + if (x->type() == dtype) { + // NOTE(zhiqiu): NPU cast op may result in wrong value, so + // add special case here. + VLOG(4) << "cast to same dtype:" << dtype; + out->mutable_data(place, x->type()); + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), out); + return; + } + + auto iter = DTYPE_2_ACL_DTYPE.find( + static_cast(dtype)); + int aclDtype = iter->second; + + if (dtype == framework::proto::VarType::FP32) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::FP16) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::INT16) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::INT32) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::INT64) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::FP64) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::BOOL) { + out->mutable_data(place); + } + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Cast", {*x}, {*out}, + {{"dst_type", static_cast(aclDtype)}}); + runner.Run(stream); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + cast, ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel); diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc index eb27df8a36757..7176a0466bb83 100644 --- a/paddle/fluid/operators/clip_op.cc +++ b/paddle/fluid/operators/clip_op.cc @@ -145,10 +145,14 @@ REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad, ops::ClipGradInplaceInferer, ops::ClipDoubleGradOpMaker); REGISTER_OP_CPU_KERNEL( clip, ops::ClipKernel, - ops::ClipKernel); + ops::ClipKernel, + ops::ClipKernel, + ops::ClipKernel); REGISTER_OP_CPU_KERNEL( clip_grad, ops::ClipGradKernel, - ops::ClipGradKernel); + ops::ClipGradKernel, + ops::ClipGradKernel, + ops::ClipGradKernel); REGISTER_OP_VERSION(clip) .AddCheckpoint( diff --git a/paddle/fluid/operators/clip_op.cu b/paddle/fluid/operators/clip_op.cu index d31b81c13c5cf..fd61e4ea61d4f 100644 --- a/paddle/fluid/operators/clip_op.cu +++ b/paddle/fluid/operators/clip_op.cu @@ -17,8 +17,12 @@ limitations under the License. 
*/ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( clip, ops::ClipKernel, - ops::ClipKernel); + ops::ClipKernel, + ops::ClipKernel, + ops::ClipKernel); REGISTER_OP_CUDA_KERNEL( clip_grad, ops::ClipGradKernel, - ops::ClipGradKernel); + ops::ClipGradKernel, + ops::ClipGradKernel, + ops::ClipGradKernel); diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 977a208d20e78..3f210219608fb 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -11,7 +11,7 @@ foreach(src ${OPS}) set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS}) endforeach() -register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) +register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op c_gen_hccl_id_op gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) if(WITH_NCCL OR WITH_RCCL) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper) @@ -19,12 +19,6 @@ if(WITH_NCCL OR WITH_RCCL) op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) endif() -if(WITH_ASCEND) - op_library(gen_nccl_id_op) - op_library(c_gen_nccl_id_op) -endif() - - if(WITH_GLOO) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper) endif() @@ -35,5 +29,38 @@ if(WITH_XPU_BKCL) op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS}) endif() +if(WITH_ASCEND_CL) + cc_library(gen_hccl_id_op_helper SRCS gen_hccl_id_op_helper.cc DEPS dynload_warpctc dynamic_loader scope) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper gen_hccl_id_op_helper) + op_library(c_gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) + op_library(gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) +endif() + set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE) set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency") + +if(WITH_ASCEND_CL) + set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hccl_op c_gen_hccl_id_op gen_hccl_id_op_helper + gen_hccl_id_op op_registry ascend_hccl flags + dynamic_loader dynload_warpctc scope device_context enforce executor) + cc_test(c_broadcast_op_npu_test SRCS c_broadcast_op_npu_test.cc + DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc + DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc + DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc + DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc + DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc + DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc + DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc + DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc + DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context 
enforce executor) + cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc + DEPS op_registry elementwise_add_op c_sync_calc_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/collective/allreduce_op.cc b/paddle/fluid/operators/collective/allreduce_op.cc index 86f1c28a9dd4f..63b135a74cf4b 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include // NOLINT #include -#include "paddle/fluid/operators/distributed_ops/allreduce_op.h" +#include "paddle/fluid/operators/collective/allreduce_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/allreduce_op.cu.cc b/paddle/fluid/operators/collective/allreduce_op.cu.cc index 9b70f78399026..fe2e491055270 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cu.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/distributed_ops/allreduce_op.h" +#include "paddle/fluid/operators/collective/allreduce_op.h" namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_allgather_op.cc b/paddle/fluid/operators/collective/c_allgather_op.cc index 4111a19c5ebc8..c4e779698ccca 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cc @@ -42,6 +42,10 @@ class CAllGatherOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the allgather result"); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for all gather.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu.cc b/paddle/fluid/operators/collective/c_allgather_op_npu.cc new file mode 100644 index 0000000000000..e7f05549d9efe --- /dev/null +++ b/paddle/fluid/operators/collective/c_allgather_op_npu.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
 */
+
+#include "paddle/fluid/operators/collective/c_allgather_op.h"
+
+#include <memory>
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+#if defined(PADDLE_WITH_ASCEND_CL)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    HcclDataType dtype = platform::ToHCCLDataType(in->type());
+
+    int ring_id = ctx.Attr<int>("ring_id");
+    std::string group =
+        std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
+    auto place = ctx.GetPlace();
+    auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
+    int nranks = comm->nranks();
+
+    framework::DDim out_dims = in->dims();
+    out_dims[0] *= nranks;
+    out->mutable_data<T>(out_dims, place);
+
+    uint64_t send_numel = in->numel();
+    void *send_buff = reinterpret_cast<void *>(const_cast<T *>(in->data<T>()));
+    void *recv_buff = reinterpret_cast<void *>(out->data<T>());
+
+    aclrtStream stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::NPUDeviceContext *>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+
+    VLOG(3) << "begin hccl allgather, parameter is: "
+            << ", group is " << group << ", ring_id is " << ring_id
+            << ", nranks is " << nranks;
+
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllGather(
+        send_buff, recv_buff, send_numel, dtype, comm->comm(),
+        reinterpret_cast<void *>(stream)));
+
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with NPU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(c_allgather, ops::CAllGatherOpASCENDKernel<int8_t>,
+                       ops::CAllGatherOpASCENDKernel<int>,
+                       ops::CAllGatherOpASCENDKernel<float>,
+                       ops::CAllGatherOpASCENDKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
new file mode 100644
index 0000000000000..4c7dfc4aad7d0
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
@@ -0,0 +1,192 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allgather); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allgather, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + int num1 = 1; + int num2 = 4; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + attrs["nranks"] = 2; + + auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size() * 2); + for (uint32_t i = 0; i < out_vec.size() / 2; i++) { + EXPECT_EQ(out_vec[i], 1.0); + } + for (uint32_t i = out_vec.size() / 2; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 2.0); + } +} + +TEST(c_allgather, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLAllGatherOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc new file mode 100644 index 0000000000000..4dece4a3721ff --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_max, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc new file mode 100644 index 0000000000000..b7fd2739d5118 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -0,0 +1,188 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allreduce_max); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector 
rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + int num1 = 100; + int num2 = 100; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id * 3); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 4.0); + } +} + +TEST(c_allreduce_max, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLAllReduceOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc similarity index 55% rename from paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc rename to paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc index 9b70f78399026..b0aa51f7cfdfd 100644 --- a/paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc +++ 
b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,14 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/distributed_ops/allreduce_op.h"
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct XPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-REGISTER_OP_CUDA_KERNEL(
-    allreduce, ops::AllReduceOpKernel<plat::CUDADeviceContext, float>,
-    ops::AllReduceOpKernel<plat::CUDADeviceContext, double>,
-    ops::AllReduceOpKernel<plat::CUDADeviceContext, int>,
-    ops::AllReduceOpKernel<plat::CUDADeviceContext, int64_t>,
-    ops::AllReduceOpKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_XPU_KERNEL(c_allreduce_max,
+                       ops::CAllReduceOpXPUKernel<ops::kRedMax, float>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc
new file mode 100644
index 0000000000000..48e1d2eeb58c5
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc
@@ -0,0 +1,31 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct ASCENDPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(
+    c_allreduce_min, ops::CAllReduceOpASCENDKernel<ops::kRedMin, int>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedMin, int8_t>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedMin, float>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedMin, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc
new file mode 100644
index 0000000000000..2f16a89c217da
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_allreduce_min, + ops::CAllReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 2f56f43d793fa..0eaa377869ef6 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -19,17 +19,31 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/memory.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/collective_helper.h" +#endif + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif +#if defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/bkcl_helper.h" +#endif + #if defined(PADDLE_WITH_GLOO) #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -105,6 +119,135 @@ class CAllReduceOpCPUKernel : public framework::OpKernel { } }; +template +class CAllReduceOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + int64_t numel = in->numel(); + + void* sendbuff = reinterpret_cast(const_cast(in->data())); + void* recvbuff = reinterpret_cast(out->data()); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM; + switch (red_type) { + case kRedSum: + hccl_red_type = HCCL_REDUCE_SUM; + break; + + case kRedMax: + hccl_red_type = HCCL_REDUCE_MAX; + break; + + case kRedMin: + hccl_red_type = HCCL_REDUCE_MIN; + break; + + case kRedProd: + hccl_red_type = HCCL_REDUCE_PROD; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + VLOG(3) << "begin hccl allreduce, parameter is: " + << "input num: " << numel << "dtype: " << dtype + << "hccl_red_type: " << hccl_red_type << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), + reinterpret_cast(stream))); + + out->Resize(in->dims()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +template +class CAllReduceOpXPUKernel : public 
framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_XPU_BKCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + BKCLDataType dtype = platform::ToBKCLDataType(in->type()); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + out->Resize(in->dims()); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + auto comm = platform::BKCLCommContext::Instance().Get(rid, place); + + XPUStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx) + ->x_context() + ->xpu_stream; + } else { + stream = comm->stream(); + } + + BKCLOp bkcl_red_type = BKCL_ADD; + switch (red_type) { + case kRedSum: + bkcl_red_type = BKCL_ADD; + break; + + case kRedMax: + bkcl_red_type = BKCL_MAX; + break; + + case kRedMin: + bkcl_red_type = BKCL_MIN; + break; + + case kRedProd: + bkcl_red_type = BKCL_PRODUCT; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + PADDLE_ENFORCE_EQ(bkcl_all_reduce(comm->comm(), sendbuff, recvbuff, numel, + dtype, bkcl_red_type, stream), + BKCL_SUCCESS, platform::errors::PreconditionNotMet( + "BKCL all reduce failed")); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should be compiled with XPU.")); +#endif + } +}; + template class CAllReduceOpCUDAKernel : public framework::OpKernel { public: @@ -170,10 +313,20 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the allreduced result."); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for all reduce.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") .SetDefault(false); + AddAttr( + "use_model_parallel", + "(bool default false) use this op with model parallel mode. In model " + "parallel mode, the backward is c_identity which returns itself for " + "c_allreduce_sum.") + .SetDefault(false); AddComment(string::Sprintf(R"DOC( CAllReduce %s Operator diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc new file mode 100644 index 0000000000000..f3d14afe0a1bc --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
 */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct ASCENDPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(
+    c_allreduce_prod, ops::CAllReduceOpASCENDKernel<ops::kRedProd, int>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedProd, int8_t>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedProd, float>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedProd, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc
new file mode 100644
index 0000000000000..92ba00428065b
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct XPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_XPU_KERNEL(c_allreduce_prod,
+                       ops::CAllReduceOpXPUKernel<ops::kRedProd, float>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc
index 68061e6ae6bea..23ed98bb044be 100644
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc
@@ -37,7 +37,12 @@ class CAllReduceSumOpGradMaker : public framework::SingleGradOpMaker<T> {
 
  protected:
   void Apply(GradOpPtr<T> retv) const override {
-    retv->SetType("c_allreduce_sum");
+    bool use_mp = BOOST_GET_CONST(bool, this->GetAttr("use_model_parallel"));
+    if (use_mp) {
+      retv->SetType("c_identity");
+    } else {
+      retv->SetType("c_allreduce_sum");
+    }
     retv->SetInput("X", this->OutputGrad("Out"));
     retv->SetOutput("Out", this->InputGrad("X"));
     retv->SetAttrMap(this->Attrs());
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc
new file mode 100644
index 0000000000000..b66e2e1968908
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc
@@ -0,0 +1,31 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_sum, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc new file mode 100644 index 0000000000000..f1bf9683e3559 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allreduce_sum); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(3) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, + int iter) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + int rank_id = atoi(getenv("RANK_ID")); + int num1 = 3; + int num2 = 128; + + std::vector init; + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + auto place = ctx.GetPlace(); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx_" + std::to_string(iter)); + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 3.0); + } +} + +TEST(c_allreduce_sum, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + // only support one device, if more than one device, use first default + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + for (int i = 0; i < 1; i++) { + VLOG(2) << "iter num: " << i; + TestHCCLAllReduceOp(&scope, ctx, i); + } +} diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc new file mode 100644 index 0000000000000..e4ec538cd2323 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+namespace paddle {
+namespace platform {
+struct XPUPlace;
+struct float16;
+}  // namespace platform
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_XPU_KERNEL(c_allreduce_sum,
+                       ops::CAllReduceOpXPUKernel<ops::kRedSum, float>)
diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cc b/paddle/fluid/operators/collective/c_broadcast_op.cc
index 928fa8549ffb9..271d543eb2364 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op.cc
@@ -42,6 +42,10 @@ class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(0);
     AddAttr<int>("root", "(int default 0) root id for broadcasting.")
         .SetDefault(0);
+#if defined(PADDLE_WITH_ASCEND_CL)
+    AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
+        .SetDefault("tag");
+#endif
     AddAttr<bool>(
         "use_calc_stream",
         "(bool default false) eject CUDA operations to calculation stream.")
diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc
new file mode 100644
index 0000000000000..a60ba86572822
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc
@@ -0,0 +1,91 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CBroadcastOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Input("X"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + auto out = ctx.Output("Out"); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int root = ctx.Attr("root"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + + VLOG(3) << "begin hccl broadcast, parameter is: " + << "root " << root << ", group is " << group + << ", comm: " << comm->comm() << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + + VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " + << framework::product(out->dims()); + + dev_ctx->Wait(); + + if (out != x) { + framework::TensorCopy(*static_cast(x), place, + *platform::DeviceContextPool::Instance().Get(place), + static_cast(out)); + } + dev_ctx->Wait(); + + out->Resize(x->dims()); + out->set_lod(x->lod()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_broadcast, ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc new file mode 100644 index 0000000000000..9e39613f3fbe3 --- /dev/null +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -0,0 +1,181 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_broadcast); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_broadcast, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = 2; + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + for (int64_t i = 0; i < num * num; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["root"] = 0; + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 1.0); + } +} + +TEST(c_broadcast, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLBroadcastOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc new file mode 100644 index 0000000000000..7817f19bacb18 --- /dev/null +++ b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#endif + +namespace paddle { +namespace operators { + +class CCommInitOpAscend : public framework::OperatorBase { + public: + CCommInitOpAscend(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override { + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "CCommInitOpAscend can run on npu place only.")); + + auto var = scope.FindVar(Input("X")); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::InvalidArgument("Input con not be empty.")); +#if defined(PADDLE_WITH_ASCEND_CL) + HcclRootInfo* hccl_id = var->GetMutable(); + + int rank_ids = Attr("rank_ids"); + int rank_id = Attr("rank"); + int rid = Attr("ring_id"); + int device_id = BOOST_GET_CONST(platform::NPUPlace, place).device; + if (Attr("device_id") >= 0) { + device_id = Attr("device_id"); + } + platform::HCCLCommContext::Instance().CreateHCCLComm( + hccl_id, rank_ids, rank_id, device_id, rid); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +class CCommInitOpAscendMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Raw variable contains a NCCL UniqueId instaces."); + AddComment(R"DOC( +CCommInit operator + +Initialize collective communicatoin context within this trainer +)DOC"); + AddAttr("rank_ids", + "(int) The number of ranks of distributed trainers"); + AddAttr("rank", + "(int) The rank of the trainer in distributed training."); + AddAttr("device_id", + "(int) The deivce_id on which to initialize the communicator." + "Now, you only have to set this attr manually for pipeline " + "training. Otherwise, make it as default.") + .SetDefault(-1); + AddAttr("ring_id", "(int default 0) user specified ring id") + .SetDefault(0); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(c_comm_init_hccl, ops::CCommInitOpAscend, + ops::CCommInitOpAscendMaker); diff --git a/paddle/fluid/operators/collective/c_concat_op.cc b/paddle/fluid/operators/collective/c_concat_op.cc new file mode 100644 index 0000000000000..551fde2116258 --- /dev/null +++ b/paddle/fluid/operators/collective/c_concat_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_concat_op.h" + +namespace paddle { +namespace operators { + +class CConcatOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_concat"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_concat"); + int nranks = ctx->Attrs().Get("nranks"); + int rank = ctx->Attrs().Get("rank"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( + "The number of ranks (%d) for c_concat " + "must be greater than 1.", + nranks)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_concat must be non-negative.", ring_id)); + PADDLE_ENFORCE_GE( + rank, 0, platform::errors::InvalidArgument( + "The rank (%d) for c_concat must be non-negative.", rank)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::InvalidArgument( + "The value of rank (%d) for c_concat must " + "be less than that of nranks.", + rank, nranks)); + + framework::DDim dim = ctx->GetInputDim("X"); + dim[dim.size() - 1] = dim[dim.size() - 1] * nranks; + if (dim[dim.size() - 1] < 0) dim[dim.size() - 1] = -1; + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +template +class CConcatOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("c_split"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +class CConcatOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor to be concated."); + AddOutput("Out", "(Tensor) the result of concat."); + AddAttr("rank", "(int default 0) rank id.").SetDefault(0); + AddAttr("nranks", "(int default 1) number of ranks.").SetDefault(1); + AddAttr("ring_id", "(int default 0) ring id.").SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default true) eject CUDA operations to calculation stream.") + .SetDefault(true); + AddAttr("use_model_parallel", + "(bool default true) use this op with model parallel.") + .SetDefault(true); + AddComment(R"DOC( +CConcat Operator +AllGather the tensors on different trainers and concat them along the last dimension. 
+)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_concat, ops::CConcatOp, + ops::CConcatOpGradMaker, + ops::CConcatOpGradMaker, + ops::CConcatOpMaker); + +REGISTER_OP_CPU_KERNEL(c_concat, ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc new file mode 100644 index 0000000000000..bfdc49c440aae --- /dev/null +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/collective/c_concat_op.h" +#include "paddle/fluid/operators/math/concat_and_split.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CConcatOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); + + int nranks = ctx.Attr("nranks"); + int rank = ctx.Attr("rank"); + int rid = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_GE(rank, 0, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_concat must be " + "greater than or equal to 0.", + rank)); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::PreconditionNotMet( + "The value of nranks (%d) for c_concat must be " + "greater than or equal to 2.", + nranks)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_concat must be " + "less than that of nranks (%d).", + rank, nranks)); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto comm = platform::NCCLCommContext::Instance().Get(rid, place); + PADDLE_ENFORCE_EQ( + nranks, comm->nranks(), + platform::errors::InvalidArgument("nranks: %s should equal to %s", + nranks, comm->nranks())); + + framework::Tensor temp_out; + framework::DDim temp_out_dims = x->dims(); + temp_out_dims[0] *= nranks; + temp_out.mutable_data(temp_out_dims, place); + int64_t send_numel = x->numel(); + const T* send_buff = x->data(); + T* recv_buff = temp_out.data(); + gpuStream_t stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + send_buff, recv_buff, send_numel, static_cast(dtype), + comm->comm(), stream)); + + std::vector inputs; + int axis = x->dims().size() - 1; + auto out_dims = x->dims(); + out_dims[out_dims.size() - 1] *= nranks; + int rows_per_tensor = 
x->dims()[0]; + int offset = 0; + for (int i = 0; i < nranks; i++) { + framework::Tensor temp = temp_out.Slice(offset, offset + rows_per_tensor); + inputs.emplace_back(temp); + offset += rows_per_tensor; + } + + math::ConcatFunctor functor; + out->mutable_data(out_dims, place); + auto& dev_ctx2 = ctx.template device_context(); + functor(dev_ctx2, inputs, axis, out); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_concat, ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel); diff --git a/paddle/fluid/operators/distributed_ops/split_byref_op.h b/paddle/fluid/operators/collective/c_concat_op.h similarity index 51% rename from paddle/fluid/operators/distributed_ops/split_byref_op.h rename to paddle/fluid/operators/collective/c_concat_op.h index fedd7218dd6cc..55a5799e37b6f 100644 --- a/paddle/fluid/operators/distributed_ops/split_byref_op.h +++ b/paddle/fluid/operators/collective/c_concat_op.h @@ -1,10 +1,10 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -14,28 +14,23 @@ limitations under the License. */ #pragma once +#include +#include #include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { -template -class SplitByrefOpKernel : public framework::OpKernel { +template +class CConcatOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto outs = ctx.MultiOutput("Out"); - auto place = ctx.GetPlace(); - - size_t row_offset = 0; - for (size_t i = 0; i < outs.size(); ++i) { - // NOTE: no need to call mutable_data here to allocate memory. - auto* out = outs[i]; - VLOG(3) << "spliting by ref: " << row_offset << " " << out->dims()[0]; - *out = in->Slice(row_offset, row_offset + out->dims()[0]); - row_offset += out->dims()[0]; - } + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_concat for cpu kernel now.")); } }; diff --git a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc new file mode 100644 index 0000000000000..593eaf923a978 --- /dev/null +++ b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#endif + +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_ASCEND_CL + +class CGenHCCLIdOp : public framework::OperatorBase { + public: + CGenHCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + int rank = Attr("rank"); + framework::Scope& local_scope = scope.NewScope(); + + std::function func = [&](size_t i) -> std::string { + return Output("Out"); + }; + + if (rank == 0) { + std::vector endpoint_list = + Attr>("other_endpoints"); + SendBroadCastHCCLID(endpoint_list, 1, func, local_scope); + } else { + std::string endpoint = Attr("endpoint"); + RecvBroadCastHCCLID(endpoint, 1, func, local_scope); + } + scope.DeleteScope(&local_scope); + } +}; + +#else + +class CGenHCCLIdOp : public framework::OperatorBase { + public: + CGenHCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + +class CGenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + VLOG(3) << "ele"; + AddOutput("Out", "Raw variable contains a HCCL UniqueId instaces."); + AddComment(R"DOC( +CGenHCCLId operator + +For trainer 0: generate a new UniqueId and send it to all the other trainers. +For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. +)DOC"); + AddAttr("endpoint", + "(string), e.g. 127.0.0.1:6175 " + "current listen endpoint"); + AddAttr>( + "other_endpoints", + "['trainer1_ip:port', 'trainer2_ip:port', ...] " + "list of other trainer endpoints") + .SetDefault({}); + AddAttr("rank", + "(int default 0) " + "The rank of the trainer in distributed training.") + .SetDefault(0); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(c_gen_hccl_id, ops::CGenHCCLIdOp, ops::CGenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/c_identity_op.cc b/paddle/fluid/operators/collective/c_identity_op.cc new file mode 100644 index 0000000000000..646c27b90e17e --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_identity_op.h" + +namespace paddle { +namespace operators { + +class CIdentityOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_identity"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_identity"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_identity must be non-negative.", ring_id)); + framework::DDim dim = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class CIdentityOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) identity tensor."); + AddOutput("Out", "(Tensor) identity tensor."); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default true) eject CUDA operations to calculation stream.") + .SetDefault(true); + AddAttr("use_model_parallel", + "(bool default true) use this op with model parallel.") + .SetDefault(true); + AddComment(R"DOC( +Identity Operator which returns a copy of itself. +)DOC"); + } +}; + +template +class CIdentityOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("c_allreduce_sum"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_identity, ops::CIdentityOp, + ops::CIdentityOpGradMaker, + ops::CIdentityOpGradMaker, + ops::CIdentityOpMaker); + +REGISTER_OP_CPU_KERNEL(c_identity, ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_identity_op.cu.cc b/paddle/fluid/operators/collective/c_identity_op.cu.cc new file mode 100644 index 0000000000000..8ccf40e317ade --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.cu.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_identity_op.h" + +namespace paddle { +namespace operators { + +template +class CIdentityOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int rid = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + rid, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_identity op must be non-negative.", rid)); + out->mutable_data(ctx.GetPlace()); + + TensorCopy(*x, out->place(), out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_identity, ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_identity_op.h b/paddle/fluid/operators/collective/c_identity_op.h new file mode 100644 index 0000000000000..ca817fb6bac0e --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CIdentityOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_identity for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc new file mode 100644 index 0000000000000..f35b4c2f70722 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_max, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/split_selected_rows_op.cu b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc similarity index 59% rename from paddle/fluid/operators/split_selected_rows_op.cu rename to paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc index 7250917036f61..6d3af7bb5f258 100644 --- a/paddle/fluid/operators/split_selected_rows_op.cu +++ b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,8 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/split_selected_rows_op.h" +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - split_selected_rows, - ops::SplitSelectedRowsOpKernel); +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_max, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc new file mode 100644 index 0000000000000..6ebb7e4c40e68 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_min, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/distributed_ops/split_byref_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc similarity index 54% rename from paddle/fluid/operators/distributed_ops/split_byref_op.cu.cc rename to paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc index 056659c3ea61f..791e58d8493ce 100644 --- a/paddle/fluid/operators/distributed_ops/split_byref_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc @@ -1,10 +1,10 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -12,8 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/distributed_ops/split_byref_op.h" +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - split_byref, - ops::SplitByrefOpKernel); +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_min, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index 1bce01e13a2ad..fa9fd079d8e48 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -24,15 +24,28 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" +#endif + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif + +#if defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/bkcl_helper.h" +#endif + #if defined(PADDLE_WITH_GLOO) #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -110,6 +123,148 @@ class CReduceOpCPUKernel : public framework::OpKernel { } }; +template +class CReduceOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + int64_t numel = in->numel(); + + void* sendbuff = reinterpret_cast(const_cast(in->data())); + void* recvbuff = reinterpret_cast(out->data()); + + int ring_id = ctx.Attr("ring_id"); + int root_id = ctx.Attr("root_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int rank_id = comm->rank(); + + HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM; + switch (red_type) { + case kRedSum: + hccl_red_type = HCCL_REDUCE_SUM; + break; + + case kRedMax: + hccl_red_type = HCCL_REDUCE_MAX; + break; + + case kRedMin: + hccl_red_type = HCCL_REDUCE_MIN; + break; + + case kRedProd: + hccl_red_type = HCCL_REDUCE_PROD; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + VLOG(3) << "begin hccl reduce, parameter is: " + << "input num: " << numel << "root_id: " << root_id + << "dtype: " << dtype << "hccl_red_type: " << hccl_red_type + << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), + reinterpret_cast(stream))); + + if (rank_id != root_id) { + auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place); + memory::Copy(npu_place, reinterpret_cast(out->data()), + npu_place, + reinterpret_cast(const_cast(in->data())), + numel * sizeof(T), stream); + } + + out->Resize(in->dims()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +template +class CReduceOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_XPU_BKCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + BKCLDataType dtype = platform::ToBKCLDataType(in->type()); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + out->Resize(in->dims()); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + 
int root = ctx.Attr("root_id"); + auto comm = platform::BKCLCommContext::Instance().Get(rid, place); + + XPUStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx) + ->x_context() + ->xpu_stream; + } else { + stream = comm->stream(); + } + + BKCLOp bkcl_red_type = BKCL_ADD; + switch (red_type) { + case kRedSum: + bkcl_red_type = BKCL_ADD; + break; + + case kRedMax: + bkcl_red_type = BKCL_MAX; + break; + + case kRedMin: + bkcl_red_type = BKCL_MIN; + break; + + case kRedProd: + bkcl_red_type = BKCL_PRODUCT; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + PADDLE_ENFORCE_EQ(bkcl_reduce(comm->comm(), sendbuff, recvbuff, numel, + dtype, bkcl_red_type, root, stream), + BKCL_SUCCESS, platform::errors::PreconditionNotMet( + "BKCL all reduce failed")); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should be compiled with XPU.")); +#endif + } +}; + template class CReduceOpCUDAKernel : public framework::OpKernel { public: @@ -179,6 +334,10 @@ class CReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the reduced result."); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for reduce.") + .SetDefault("tag"); +#endif AddAttr("root_id", "(int default 0) root id.").SetDefault(0); AddAttr( "use_calc_stream", diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc new file mode 100644 index 0000000000000..f0b7021e7997d --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_prod, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc new file mode 100644 index 0000000000000..e7e770e8ffdca --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_prod, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc new file mode 100644 index 0000000000000..dd4dbbd5f3645 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_sum, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc new file mode 100644 index 0000000000000..3683c7722ba3b --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_reduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_reduce_sum); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(3) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + int rank_id = atoi(getenv("RANK_ID")); + int num1 = 3; + int num2 = 128; + + std::vector init; + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + auto place = ctx.GetPlace(); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx_" + std::to_string(iter)); + attrs["ring_id"] = 0; + int root_id = 0; + attrs["root_id"] = root_id; + + auto op = f::OpRegistry::CreateOp("c_reduce_sum", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + op->Run(*scope, place); + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + if (rank_id == root_id) { + EXPECT_EQ(out_vec[i], 3.0); + } else { + EXPECT_EQ(out_vec[i], init[i]); + } + } +} + +TEST(c_reduce_sum, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + for (int i = 0; i < 2; i++) { + VLOG(2) << "iter num: " << i; + TestHCCLReduceOp(&scope, ctx, i); + } +} diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc new file mode 100644 index 0000000000000..a0ec4d2a99cd7 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_sum, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cc index ada1fd2b1270c..7836f11dc9b1f 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cc @@ -49,6 +49,10 @@ class CReduceScatterOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("nranks", "Total trainer count of the distributed training job") .SetDefault(1); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for reduce scatter.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.h b/paddle/fluid/operators/collective/c_reducescatter_op.h index 366d8a3747cfb..490b152bc2d30 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.h +++ b/paddle/fluid/operators/collective/c_reducescatter_op.h @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc new file mode 100644 index 0000000000000..44096a82c34d6 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CReduceScatterOpAscendKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto place = ctx.GetPlace(); + auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); + int nranks = comm->nranks(); + + auto out_dims = in->dims(); + PADDLE_ENFORCE_EQ(out_dims[0] % nranks, 0, + platform::errors::InvalidArgument( + "The input tensor X's " + "dim[0] (%d) should be divisible by nranks(%d)", + out_dims[0], nranks)); + + out_dims[0] = out_dims[0] / nranks; + out->mutable_data(out_dims, place); + + uint64_t recv_numel = in->numel() / nranks; + + void* inputPtr = reinterpret_cast(const_cast(in->data())); + void* outputPtr = reinterpret_cast(out->data()); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + + aclrtStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + VLOG(3) << "begin hccl reduce scatter, parameter is: " + << "recv_numel: " << recv_numel << "dtype: " << dtype + << "hccl_red_type: " << HCCL_REDUCE_SUM << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclReduceScatter( + inputPtr, outputPtr, recv_numel, dtype, HCCL_REDUCE_SUM, comm->comm(), + reinterpret_cast(stream))); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reducescatter, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc new file mode 100644 index 0000000000000..f82f050a7206f --- /dev/null +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_reducescatter); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_reducescatter, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int num1 = 4; + int num2 = 1; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + attrs["nranks"] = 2; + + auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + int iter_num = 10; + for (int i = 0; i < iter_num; i++) { + op->Run(*scope, place); + ctx.Wait(); + } + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + EXPECT_EQ(out_vec.size(), init.size() / 2); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 2.0); + } +} + +TEST(c_reducescatter, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLReduceScatterOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_split_op.cc b/paddle/fluid/operators/collective/c_split_op.cc new file mode 100644 index 0000000000000..03046d571d0f0 --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_split_op.h" + +namespace paddle { +namespace operators { + +class CSplitOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_split"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_split"); + int nranks = ctx->Attrs().Get("nranks"); + int rank = ctx->Attrs().Get("rank"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( + "The number of ranks (%d) for c_split " + "must be greater than 1.", + nranks)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_split must be non-negative.", ring_id)); + PADDLE_ENFORCE_GE( + rank, 0, platform::errors::InvalidArgument( + "The rank (%d) for c_split must be non-negative.", rank)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::InvalidArgument( + "The value of rank (%d) for c_split must " + "be less than that of nranks.", + rank, nranks)); + + framework::DDim dim = ctx->GetInputDim("X"); + dim[dim.size() - 1] = dim[dim.size() - 1] / nranks; + if (dim[0] < 0) dim[0] = -1; + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +template +class CSplitOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("c_allgather"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +class CSplitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor to be split."); + AddOutput("Out", "(Tensor) the result of split."); + AddAttr("rank", "(int default 0) rank id.").SetDefault(0); + AddAttr("nranks", "(int default 1) number of ranks.").SetDefault(1); + AddAttr("ring_id", "(int default 0) ring id.").SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddAttr("use_model_parallel", + "(bool default false) use this op with model parallel.") + .SetDefault(true); + AddComment(R"DOC( +CSplit Operator +Split the tensor evenly according to its rank. 
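+
+For example, with nranks = 2 and an input of shape [N, H], the rank-th slice of
+shape [N, H / 2] along the last dimension is returned on each rank. The gradient
+of c_split is computed by the c_allgather operator (see CSplitOpGradMaker above).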
+)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_split, ops::CSplitOp, + ops::CSplitOpGradMaker, + ops::CSplitOpGradMaker, + ops::CSplitOpMaker); + +REGISTER_OP_CPU_KERNEL(c_split, ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_split_op.cu.cc b/paddle/fluid/operators/collective/c_split_op.cu.cc new file mode 100644 index 0000000000000..92a7f5e41b1d2 --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.cu.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/collective/c_split_op.h" +#include "paddle/fluid/operators/math/concat_and_split.h" + +namespace paddle { +namespace operators { + +template +class CSplitOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int nranks = ctx.Attr("nranks"); + int rank = ctx.Attr("rank"); + auto place = ctx.GetPlace(); + + PADDLE_ENFORCE_GE(rank, 0, platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_split must be " + "greater than or equal to 0.", + rank)); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::PreconditionNotMet( + "The value of nranks (%d) for c_split must be " + "greater than or equal to 2.", + nranks)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_split must be " + "less than that of nranks (%d).", + rank, nranks)); + + auto& dev_ctx = ctx.template device_context(); + std::vector shape_refer; + std::vector results; + size_t numel = x->numel(); + auto dims = x->dims(); + numel /= nranks; + int axis = dims.size() - 1; + dims[dims.size() - 1] /= nranks; + for (int i = 0; i < nranks; i++) { + framework::Tensor* out = new framework::Tensor(); + out->mutable_data(dims, place); + shape_refer.emplace_back(out); + results.emplace_back(out); + } + + math::SplitFunctor functor; + functor(dev_ctx, *x, shape_refer, axis, &results); + out->mutable_data(dims, place); + paddle::framework::TensorCopySync(*results[rank], out->place(), out); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_split, ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_split_op.h b/paddle/fluid/operators/collective/c_split_op.h new file mode 100644 index 0000000000000..ea0c7fc45c66b --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CSplitOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_split for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 700d1173e2ff6..83da712bee908 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -61,6 +61,16 @@ class CSyncCalcStreamCudaKernel : public framework::OpKernel { PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); #endif +#elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on npu place only for now.")); + + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(dev_ctx->stream())); + #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc new file mode 100644 index 0000000000000..4b1f7bb340178 --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(elementwise_add); +USE_OP_DEVICE_KERNEL(elementwise_add, NPU); +USE_NO_KERNEL_OP(c_sync_calc_stream); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + auto y = scope->Var("Y"); + auto tensor_y = y->GetMutable(); + + std::vector init_x; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_x.push_back(static_cast(1.0)); + } + + std::vector init_y; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_y.push_back(static_cast(2.0)); + } + + TensorFromVector(init_x, ctx, tensor_x); + tensor_x->Resize({10, 10}); + TensorFromVector(init_y, ctx, tensor_y); + tensor_y->Resize({10, 10}); + + f::AttributeMap attrs; + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + // sync data + auto sync_op0 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op0->Run(*scope, place); + + // run + + auto op = + f::OpRegistry::CreateOp("elementwise_add", {{"X", {"X"}}, {"Y", {"Y"}}}, + {{"Out", {"Out"}}}, attrs); + + op->Run(*scope, place); + + // sync op run + auto sync_op = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op->Run(*scope, place); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + // sync op copy + auto sync_op2 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op2->Run(*scope, place); + + float expected = 3.0; + + EXPECT_EQ(out_vec.size(), init_x.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], static_cast(expected)); + } +} + +TEST(c_sync_calc_stream, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index 95b9cd040fe94..e6f6bf5345619 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -19,6 +19,11 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/nccl_helper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -56,9 +61,8 @@ template class CSyncCommStreamCudaKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto place = ctx.GetPlace(); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) int ring_id = ctx.Attr("ring_id"); auto stream = @@ -70,6 +74,16 @@ class CSyncCommStreamCudaKernel : public framework::OpKernel { PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); #endif +#elif defined(PADDLE_WITH_ASCEND_CL) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on npu place only for now.")); + int ring_id = ctx.Attr("ring_id"); + auto stream = + platform::HCCLCommContext::Instance().Get(ring_id, place)->stream(); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream)); + #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc new file mode 100644 index 0000000000000..3915ec4fa35e8 --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -0,0 +1,190 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_broadcast); +USE_NO_KERNEL_OP(c_sync_comm_stream); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_broadcast, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = 2; + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + std::cout << "rank_id:" << rank_id << std::endl; + for (int64_t i = 0; i < num * num; ++i) { + init.push_back(1.0 + rank_id); + std::cout << init[0]; + } + std::cout << std::endl; + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["root"] = 0; + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + op->Run(*scope, place); + + // comm sync + + auto sync_op = f::OpRegistry::CreateOp( + "c_sync_comm_stream", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); + sync_op->Run(*scope, place); + + // ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 1.0); + } +} + +TEST(c_sync_comm_stream_op, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLBroadcastOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op.cc b/paddle/fluid/operators/collective/gen_hccl_id_op.cc new file mode 100644 index 0000000000000..0cb2dd188725f --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op.cc @@ -0,0 +1,216 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/hccl_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_ASCEND_CL + +class GenHCCLIdOp : public framework::OperatorBase { + public: + GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + std::vector trainers = + Attr>("trainers"); + int trainer_id = Attr("trainer_id"); + std::string endpoint = trainers[trainer_id]; + + PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument( + "trainer_id %d is less than 0. Its " + "valid range is [0, trainer_size)")); + PADDLE_ENFORCE_LT( + trainer_id, static_cast(trainers.size()), + platform::errors::OutOfRange("trainer_id %d is out of range. Its valid " + "range is [0, trainer_size)", + trainer_id)); + + int hccl_comm_num = Attr("hccl_comm_num"); + int use_hierarchical_allreduce = Attr("use_hierarchical_allreduce"); + int inter_nranks = Attr("hierarchical_allreduce_inter_nranks"); + int inter_trainer_id = -1; + int exter_trainer_id = -1; + + if (use_hierarchical_allreduce) { + PADDLE_ENFORCE_GT( + trainers.size(), 1, + platform::errors::PreconditionNotMet( + "The number of collective trainers %llu <= 1", trainers.size())); + PADDLE_ENFORCE_GT( + inter_nranks, 1, + platform::errors::PreconditionNotMet( + "inter_nranks %d <= 1 while in hierarchical allreduce mode", + inter_nranks)); + PADDLE_ENFORCE_EQ( + trainers.size() % inter_nranks, 0, + platform::errors::PreconditionNotMet( + "The number of trainers %llu mod inter_nranks %d is not equal 0", + trainers.size(), inter_nranks)); + + inter_trainer_id = trainer_id % inter_nranks; + + if (trainer_id % inter_nranks == 0) { + exter_trainer_id = trainer_id / inter_nranks; + } + } + + std::ostringstream ss; + for (size_t i = 0; i < trainers.size(); i++) { + ss << trainers[i] << ","; + } + + VLOG(1) << "trainer_id:" << trainer_id + << ", use_hierarchical_allreduce:" << use_hierarchical_allreduce + << ", hccl_comm_num:" << hccl_comm_num + << ", inter_nranks:" << inter_nranks + << ", inter_trainer_id:" << inter_trainer_id + << ", exter_trainer_id:" << exter_trainer_id + << ", trainers:" << ss.str(); + + int server_fd = -1; + + /// 1. 
init flat + std::function func = platform::GetFlatHCCLVarName; + if (trainer_id == 0) { + // server endpoints + std::vector flat_endpoints; + flat_endpoints.insert(flat_endpoints.begin(), trainers.begin() + 1, + trainers.end()); + SendBroadCastHCCLID(flat_endpoints, hccl_comm_num, func, scope); + } else { + server_fd = CreateListenSocket(endpoint); + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + /// 2. hierarchical inter ncclid + func = platform::GetHierarchicalInterHCCLVarName; + if (inter_trainer_id == 0) { + std::ostringstream ss; + ss << endpoint; + std::vector inter_endpoints; + for (int i = trainer_id + 1; i < trainer_id + inter_nranks && + i < static_cast(trainers.size()); + i++) { + ss << ","; + inter_endpoints.push_back(trainers[i]); + ss << trainers[i]; + } + VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str(); + + SendBroadCastHCCLID(inter_endpoints, hccl_comm_num, func, scope); + } else if (inter_trainer_id > 0) { + VLOG(1) << "Hierarchical inter ring"; + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + /// 3. hierarchical exter ncclid + func = platform::GetHierarchicalExterHCCLVarName; + if (exter_trainer_id == 0) { + std::ostringstream ss; + std::vector exter_endpoints; + ss << endpoint; + for (size_t i = inter_nranks; i < trainers.size(); i += inter_nranks) { + ss << ","; + exter_endpoints.push_back(trainers[i]); + ss << trainers[i]; + } + VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str(); + + SendBroadCastHCCLID(exter_endpoints, hccl_comm_num, func, scope); + } else if (exter_trainer_id > 0) { + VLOG(1) << "Hierarchical exter ring"; + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + // close socket server + if (trainer_id != 0) { + CloseSocket(server_fd); + } + } +}; + +#else +class GenHCCLIdOp : public framework::OperatorBase { + public: + GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + +class GenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("HCCLID", "Raw variable contains a HCCL UniqueId instaces."); + AddComment(R"DOC( +GenHCCLId operator + +For trainer 0: generate a new UniqueId and send it to all the other trainers. +For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. +)DOC"); + AddAttr>( + "trainers", + "['trainer0_ip:port', 'trainer1_ip:port', ...] 
" + "list of all trainer endpoints") + .SetDefault({}); + AddAttr("trainer_id", + "(int) " + "The index of the trainer in distributed training."); + AddAttr("hccl_comm_num", + "(int default 1) " + "The number of nccl communicator num.") + .SetDefault(1); + AddAttr("use_hierarchical_allreduce", + "(bool default false) " + "Wheter to use hierarchical allreduce.") + .SetDefault(false); + AddAttr("hierarchical_allreduce_inter_nranks", + "(int default 1) " + "Wheter to use hierarchical allreduce.") + .SetDefault(-1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(gen_hccl_id, ops::GenHCCLIdOp, ops::GenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc new file mode 100644 index 0000000000000..15940a76f7110 --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc @@ -0,0 +1,350 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/split.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +constexpr char COMM_HEAD[] = "_pd_gen_comm_id_"; +#define HCCL_UNIQUE_ID_BYTES 1024 + +// Check system calls, such as socket, bind. 
+#define CHECK_SYS_CALL(call, name) \ + do { \ + int retval; \ + CHECK_SYS_CALL_VAL(call, name, retval); \ + } while (false) + +#define CHECK_SYS_CALL_VAL(call, name, retval) \ + do { \ + RETRY_SYS_CALL_VAL(call, name, retval); \ + if (retval == -1) { \ + PADDLE_THROW(platform::errors::Unavailable("Call to %s failed: %s", \ + name, strerror(errno))); \ + } \ + } while (false) + +#define RETRY_SYS_CALL_VAL(call, name, retval) \ + do { \ + retval = (call); \ + if (retval == -1 && \ + (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ + LOG(WARNING) << "Call " << name << " returned " << strerror(errno) \ + << " retry"; \ + } else { \ + break; \ + } \ + } while (true) + +static int SocketSend(int fd, const char* buffer, int size) { + int offset = 0; + int bytes = 0; + while (offset < size) { + bytes = send(fd, buffer + offset, size - offset, 0); + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + // send failed + return -1; + } else { + bytes = 0; + } + } + offset += bytes; + } + return offset; +} + +static int SocketRecv(int fd, char* buffer, int size) { + int offset = 0; + int bytes = 0; + while (offset < size) { + bytes = recv(fd, buffer + offset, size - offset, 0); + if (bytes == 0) { + // closed by client, maybe probing alive client + return 0; + } + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + return -1; + } else { + bytes = 0; + } + } + offset += bytes; + } + return offset; +} + +static void BindOrConnectFailed(int timeout, int* try_times, int* total_time, + const char* op, const std::string& ep) { + PADDLE_ENFORCE_LT( + *total_time, timeout, + platform::errors::Unavailable("%s addr=%s timeout, failed reason: %s", op, + ep.c_str(), strerror(errno))); + ++(*try_times); + int retry_time = std::min(*try_times * 500, 3000); // max 3 seconds + *total_time += retry_time; + + LOG(WARNING) << op << " addr=" << ep << " failed " << *try_times + << " times with reason: " << strerror(errno) << " retry after " + << retry_time / 1000.0 << " seconds"; + std::this_thread::sleep_for(std::chrono::milliseconds(retry_time)); +} + +int CreateListenSocket(const std::string& ep) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + // creating socket fd + int server_fd = -1; + CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", server_fd); + + // NOTE. Solutions to `Address already in use`. + // 1. Reuse addr&port. Otherwise, once the server closes the socket + // before client, the server will enter TIME-WAIT status. If we bind port + // again, the error `Address already in use` will appear. + // 2. Or we can close the client first to ensure that the server does + // not enter the TIME-WAIT state. But this is obviously not as convenient + // as the reuse method. 
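+ // Bind failures that survive the retriable-errno loop are handled further
+ // down: BindOrConnectFailed() sleeps with a growing backoff (at most 3
+ // seconds per retry) and throws once the accumulated wait exceeds the
+ // 900-second timeout.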
+ int opt = 1; +#if defined(SO_REUSEPORT) + // since Linux kernel 3.9 + CHECK_SYS_CALL(setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, + &opt, sizeof(opt)), + "setsockopt"); +#else + CHECK_SYS_CALL( + setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), + "setsockopt"); +#endif + + struct sockaddr_in address; + address.sin_family = AF_INET; + address.sin_addr.s_addr = INADDR_ANY; + address.sin_port = htons(port); + + // TODO(wangxi) Set from env, default 900s=15min + int timeout = 900 * 1000; + int try_times = 0; + int total_time = 0; + while (true) { + int ret_val = -1; + RETRY_SYS_CALL_VAL( + bind(server_fd, (struct sockaddr*)&address, sizeof(address)), "bind", + ret_val); + + if (ret_val == -1) { + BindOrConnectFailed(timeout, &try_times, &total_time, "bind", ep); + continue; + } + break; + } + + CHECK_SYS_CALL(listen(server_fd, 3), "listen"); + LOG(INFO) << "Server listening on: " << ep << " successful."; + return server_fd; +} + +void CloseSocket(int fd) { CHECK_SYS_CALL(close(fd), "close"); } + +static int SocketAccept(int server_fd, const char* head) { + struct sockaddr_in client_addr; + socklen_t addr_length = sizeof(client_addr); + char buffer[1024] = {0}; + int conn = -1; + + while (true) { + CHECK_SYS_CALL_VAL( + accept(server_fd, reinterpret_cast(&client_addr), + &addr_length), + "accept", conn); + + int ret_val = SocketRecv(conn, buffer, strlen(head)); + if (ret_val > 0 && strncmp(buffer, head, strlen(head)) == 0) { + break; // accept client + } else { + VLOG(3) << "socket read failed with ret_val=" << ret_val; + CloseSocket(conn); + } + } + return conn; +} + +static int ConnectAddr(const std::string& ep, const char* head) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + int sock = -1; + CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", sock); + + struct sockaddr_in server_addr; + memset(&server_addr, 0, sizeof(server_addr)); + server_addr.sin_family = AF_INET; + server_addr.sin_port = htons(port); + + char* ip = NULL; + struct hostent* hp = NULL; + hp = gethostbyname(host.c_str()); + PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument( + "Fail to get host by name %s.", host)); + + int i = 0; + while (hp->h_addr_list[i] != NULL) { + ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]); + VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip; + break; + } + + PADDLE_ENFORCE_GT(inet_pton(AF_INET, ip, &server_addr.sin_addr), 0, + platform::errors::Unavailable("Open address %s failed: %s", + ep, strerror(errno))); + + // TODO(wangxi) Set from env, default 900s=15min + int timeout = 900 * 1000; + int try_times = 0; + int total_time = 0; + while (true) { + int ret_val = -1; + RETRY_SYS_CALL_VAL( + connect(sock, (struct sockaddr*)&server_addr, sizeof(server_addr)), + "connect", ret_val); + + if (ret_val == -1) { + BindOrConnectFailed(timeout, &try_times, &total_time, "connect", ep); + continue; + } + + CHECK_SYS_CALL(SocketSend(sock, head, strlen(head)), "send"); + break; + } + return sock; +} + +static void RecvHCCLID(int conn, HcclRootInfo* hccl_id) { + char buffer[1024] = {0}; + static_assert(HCCL_UNIQUE_ID_BYTES <= 1024, + "hccl id bytes must <= buffer size"); + + CHECK_SYS_CALL(SocketRecv(conn, buffer, HCCL_UNIQUE_ID_BYTES), + "recv hccl id"); + memcpy(hccl_id, buffer, HCCL_UNIQUE_ID_BYTES); +} + +static void 
SendHCCLID(int conn, HcclRootInfo* hccl_id) { + char buffer[1024] = {0}; + memcpy(buffer, hccl_id, HCCL_UNIQUE_ID_BYTES); + + CHECK_SYS_CALL(SocketSend(conn, buffer, HCCL_UNIQUE_ID_BYTES), + "send hccl id"); +} + +void SendBroadCastHCCLID(std::vector servers, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + // connect with server + std::vector connects; + for (auto server : servers) { + VLOG(3) << "connecting endpoint: " << server; + int conn = ConnectAddr(server, COMM_HEAD); + connects.push_back(conn); + } + VLOG(3) << "connecting completed..."; + + for (int i = 0; i < hccl_comm_num; ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(hccl_id)); + + int j = 0; + for (auto conn : connects) { + VLOG(3) << "sending hccl_id_var: " << var_name << " to " << servers[j] + << " hccl_comm_no: " << i; + SendHCCLID(conn, hccl_id); + ++j; + } + VLOG(3) << "sending completed..."; + } + + // close client + for (auto conn : connects) { + CloseSocket(conn); + } +} + +void RecvBroadCastHCCLID(std::string endpoint, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + int server = CreateListenSocket(endpoint); + RecvBroadCastHCCLID(server, endpoint, hccl_comm_num, func, scope); + CloseSocket(server); +} + +void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + int client = SocketAccept(server_fd, COMM_HEAD); + + for (int i = 0; i < hccl_comm_num; ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + + VLOG(3) << "trainer: " << endpoint << " receiving hccl_id_var: " << var_name + << " from trainer 0, hccl_comm_no: " << i; + RecvHCCLID(client, hccl_id); + } + VLOG(3) << "receiving completed..."; + CloseSocket(client); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h new file mode 100644 index 0000000000000..1ad6f791e1fc3 --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { + +int CreateListenSocket(const std::string& ep); + +void CloseSocket(int fd); + +void SendBroadCastHCCLID(std::vector servers, int nccl_comm_num, + std::function func, + const framework::Scope& scope); + +// server listen on endpoint, then recv nccl id +void RecvBroadCastHCCLID(std::string endpoint, int nccl_comm_num, + std::function func, + const framework::Scope& scope); + +// recv nccl id from socket +void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int nccl_comm_num, + std::function func, + const framework::Scope& scope); +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/recv_v2_op.cc b/paddle/fluid/operators/collective/recv_v2_op.cc index 0ae7b821617f9..39a9ed0c74ef5 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cc @@ -70,6 +70,12 @@ class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker { AddAttr("peer", "(int default 0) rank id for sender.").SetDefault(0); AddAttr("dtype", "(int default 5('float32')) data type of tensor.") .SetDefault(5); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); + AddAttr("srTag", "(string default tag) tag for broadcasting.") + .SetDefault(0); +#endif AddAttr>("out_shape", "shape of the output tensor.") .SetDefault(std::vector()); AddAttr( diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc new file mode 100644 index 0000000000000..69f1f4681a33d --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/recv_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CRecvOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Output("Out"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int nranks = comm->nranks(); + int peer = ctx.Attr("peer"); + + PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); + + int root = peer; + + VLOG(3) << "begin hccl recv, parameter is: " + << "root " << root << ", comm: " << comm->comm() + << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(recv_v2, ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc new file mode 100644 index 0000000000000..384dfd1fc5f2d --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/operators/collective/recv_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(recv_v2); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(recv_v2, NPU); + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + + int num = atoi(getenv("DATA_SIZE")); + EXPECT_GT(num, 0); + EXPECT_LT(num, 1 << 15); + int rank_id = atoi(getenv("RANK_ID")); + VLOG(3) << "rank_id:" << rank_id << std::endl; + + ctx.Wait(); + auto place = ctx.GetPlace(); + auto out = scope->Var("Data"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + f::AttributeMap attrs; + attrs["tag"] = std::string("srtest"); + attrs["peer"] = atoi(getenv("SRC_RANK")); + attrs["ring_id"] = 0; + attrs["srTag"] = 0; + std::vector out_shape; + 
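+ // The receiver is told the tensor shape explicitly: the test fills the
+ // out_shape attribute ({num, num} here) to match the tensor resized above.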
out_shape.push_back(num); + out_shape.push_back(num); + attrs["out_shape"] = out_shape; + + auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Data"}}}, attrs); + VLOG(3) << "CreateOp recv_v2"; + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + VLOG(3) << "Run op recv_v2"; + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); + EXPECT_EQ(out_vec == init, true); +} + +TEST(recv_v2, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + char* npu_id = getenv("FLAGS_selected_npus"); + VLOG(3) << "Select npu:" << npu_id; + p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHcomRecvOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/send_v2_op.cc b/paddle/fluid/operators/collective/send_v2_op.cc index c5a86b4f08813..c60d560e43bae 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cc @@ -50,6 +50,12 @@ class SendOpV2Maker : public framework::OpProtoAndCheckerMaker { AddAttr("ring_id", "(int default 0) nccl communication ring id.") .SetDefault(0); AddAttr("peer", "(int default 0) rank id for receiver.").SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); + AddAttr("srTag", "(string default tag) tag for broadcasting.") + .SetDefault(0); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc new file mode 100644 index 0000000000000..0ade090fcaac0 --- /dev/null +++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/send_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CSendOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Input("X"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int nranks = comm->nranks(); + int rank = comm->rank(); + + PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); + + int root = rank; + + VLOG(3) << "begin hccl send, parameter is: " + << "root " << root << ", comm: " << comm->comm() + << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(send_v2, ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc new file mode 100644 index 0000000000000..cf01b1d0a6a1d --- /dev/null +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/operators/collective/send_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(send_v2); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(send_v2, NPU); + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = atoi(getenv("DATA_SIZE")); + + EXPECT_GT(num, 0); + EXPECT_LT(num, 1 << 15); + std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); + int rank_id = atoi(getenv("RANK_ID")); + VLOG(3) << "rank id:" << rank_id; + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + ctx.Wait(); + auto place = ctx.GetPlace(); + ctx.Wait(); + + f::AttributeMap attrs; + attrs["tag"] = std::string("srtest"); + attrs["peer"] = atoi(getenv("DEST_RANK")); + attrs["ring_id"] = 0; + attrs["srTag"] = 0; + + auto op = 
f::OpRegistry::CreateOp("send_v2", {{"X", {"Data"}}}, {}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + VLOG(3) << "send run over"; + ctx.Wait(); +} + +TEST(send_v2, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + char* npu_id = getenv("FLAGS_selected_npus"); + VLOG(3) << "Select npu:" << npu_id; + p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHcomSendOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc new file mode 100644 index 0000000000000..87bb3397ca267 --- /dev/null +++ b/paddle/fluid/operators/concat_op_npu.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class ConcatNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + framework::LoDTensor* out = ctx.Output("Out"); + PADDLE_ENFORCE_NOT_NULL(ins[0], + platform::errors::NotFound( + "The first input tensor is not initalized.")); + auto axis = ctx.Attr("axis"); + + if (ctx.HasInput("AxisTensor")) { + PADDLE_THROW(platform::errors::NotFound( + "The AxisTensor is not supported on NPU now.")); + } + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + + auto place = ctx.GetPlace(); + out->mutable_data(place); + + std::vector inputs; + std::vector names; + for (size_t i = 0; i < ins.size(); ++i) { + if (ins[i] && ins[i]->numel() > 0) { + inputs.push_back(*ins[i]); + names.push_back("x" + std::to_string(i)); + } else { + continue; + } + } + auto stream = + ctx.template device_context() + .stream(); + auto runner = NpuOpRunner( + "ConcatD", {inputs}, {*out}, + {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}); + runner.AddInputNames(names); + runner.Run(stream); + } +}; + +template +class ConcatGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto ins = ctx.MultiInput("X"); + auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); + auto outs = + ctx.MultiOutput(framework::GradVarName("X")); + + PADDLE_ENFORCE_NOT_NULL(ins[0], + platform::errors::NotFound( + "The first input tensor is not initalized.")); + + auto axis = ctx.Attr("axis"); + + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + + int offset = 0; + auto stream = + ctx.template device_context() + .stream(); + for (size_t j = 0; j < outs.size(); ++j) { + // For stop gradient + // get output tensor that the name is not kEmptyVarName + if (out_var_names[j] != framework::kEmptyVarName && + outs[j]->numel() != 0UL) { + outs[j]->mutable_data(ctx.GetPlace()); + 
std::vector offsets; + std::vector sizes; + for (int dim = 0; dim < ins[j]->dims().size(); ++dim) { + if (dim == axis) { + offsets.push_back(offset); + sizes.push_back(ins[j]->dims()[dim]); + } else { + offsets.push_back(0); + sizes.push_back(ins[j]->dims()[dim]); + } + } + auto runner = NpuOpRunner("SliceD", {*out_grad}, {*outs[j]}, + {{"offsets", offsets}, {"size", sizes}}); + runner.Run(stream); + } + if (ins[j]->numel() != 0UL) { + offset += ins[j]->dims()[axis]; + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL(concat, ops::ConcatNPUKernel, + ops::ConcatNPUKernel, + ops::ConcatNPUKernel); + +REGISTER_OP_NPU_KERNEL(concat_grad, ops::ConcatGradNPUKernel, + ops::ConcatGradNPUKernel, + ops::ConcatGradNPUKernel); diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 3cad86d96c26a..bf047de86fc21 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -23,29 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -template -class CompareOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - int axis = context.Attr("axis"); - - if (x->numel() == 1 && y->numel() == 1) { - bool* z_data = z->mutable_data(context.GetPlace()); - z_data[0] = Functor()(x->data()[0], y->data()[0]); - } else { - ElementwiseComputeEx( - context, x, y, axis, Functor(), z); - } - } -}; - template class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: @@ -153,16 +130,22 @@ class CompareOp : public framework::OperatorWithKernel { REGISTER_COMPARE_OP_VERSION(op_type); REGISTER_COMPARE_OP(less_than, "Out = X < Y"); -REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor); +REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor, + paddle::operators::GreaterEqualFunctor); REGISTER_COMPARE_OP(less_equal, "Out = X <= Y"); -REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor); +REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor, + paddle::operators::GreaterThanFunctor); REGISTER_COMPARE_OP(greater_than, "Out = X > Y"); REGISTER_COMPARE_KERNEL(greater_than, CPU, - paddle::operators::GreaterThanFunctor); + paddle::operators::GreaterThanFunctor, + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y"); REGISTER_COMPARE_KERNEL(greater_equal, CPU, - paddle::operators::GreaterEqualFunctor); + paddle::operators::GreaterEqualFunctor, + paddle::operators::LessThanFunctor); REGISTER_COMPARE_OP(equal, "Out = X == Y"); -REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor); +REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor, + paddle::operators::EqualFunctor); REGISTER_COMPARE_OP(not_equal, "Out = X != Y"); -REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor); +REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor, + paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu index b1f3063583597..3ca700e16e6e7 100644 --- 
a/paddle/fluid/operators/controlflow/compare_op.cu +++ b/paddle/fluid/operators/controlflow/compare_op.cu @@ -14,11 +14,17 @@ limitations under the License. */ #include "paddle/fluid/operators/controlflow/compare_op.h" -REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); -REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); -REGISTER_COMPARE_KERNEL(greater_than, CUDA, +REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor, + paddle::operators::GreaterEqualFunctor); +REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor, paddle::operators::GreaterThanFunctor); +REGISTER_COMPARE_KERNEL(greater_than, CUDA, + paddle::operators::GreaterThanFunctor, + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_KERNEL(greater_equal, CUDA, - paddle::operators::GreaterEqualFunctor); -REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor); -REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor); + paddle::operators::GreaterEqualFunctor, + paddle::operators::LessThanFunctor); +REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor, + paddle::operators::EqualFunctor); +REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor, + paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h index b7529e4ae632d..ff929ee7dfce7 100644 --- a/paddle/fluid/operators/controlflow/compare_op.h +++ b/paddle/fluid/operators/controlflow/compare_op.h @@ -68,7 +68,7 @@ struct NotEqualFunctor { } }; -template +template class CompareOpKernel : public framework::OpKernel { public: @@ -80,21 +80,33 @@ class CompareOpKernel auto* y = context.Input("Y"); auto* z = context.Output("Out"); int axis = context.Attr("axis"); - ElementwiseComputeEx(context, x, y, axis, - Functor(), z); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + if (x_dims.size() >= y_dims.size()) { + ElementwiseComputeEx(context, x, y, axis, + Functor(), z); + } else { + ElementwiseComputeEx( + context, x, y, axis, InverseFunctor(), z); + } } }; } // namespace operators } // namespace paddle -#define REGISTER_COMPARE_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); +#define REGISTER_COMPARE_KERNEL(op_type, dev, functor, inverse_functor) \ + REGISTER_OP_##dev##_KERNEL(op_type, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>); diff --git a/paddle/fluid/operators/controlflow/compare_op_npu.cc b/paddle/fluid/operators/controlflow/compare_op_npu.cc new file mode 100644 index 0000000000000..591fb55936734 --- /dev/null +++ b/paddle/fluid/operators/controlflow/compare_op_npu.cc @@ -0,0 +1,78 @@ +/* 
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#ifdef PADDLE_WITH_ASCEND_CL + +namespace paddle { +namespace operators { + +template +class EqualNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class LessThanNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + // int axis = context.Attr("axis"); + z->mutable_data(ctx.GetPlace()); // allocate + auto runner = NpuOpRunner("Less", {*x, *y}, {*z}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(equal, ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel); + +REGISTER_OP_NPU_KERNEL( + less_than, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel); + +#endif diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index b9ea2ade6cb90..6513bae839e98 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -78,6 +78,13 @@ class ConditionalOp : public framework::OperatorBase { framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); res = cpu_tensor.data()[0]; +#endif + } else if (platform::is_npu_place(ips[0]->place())) { +#ifdef PADDLE_WITH_ASCEND_CL + framework::LoDTensor cpu_tensor; + framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); + platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); + res = cpu_tensor.data()[0]; #endif } else { res = ips[0]->data()[0]; diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index d86b6b48422d9..fdd1b776bd8fa 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -44,6 +44,11 @@ static void DataCopy(const framework::LoDTensor &src_item, TensorCopySync(src_item, platform::CPUPlace(), dst_item); } #else +#ifdef PADDLE_WITH_ASCEND_CL + if 
(platform::is_npu_place(src_item.place())) { + platform::DeviceContextPool::Instance().Get(src_item.place())->Wait(); + } +#endif TensorCopySync(src_item, platform::CPUPlace(), dst_item); #endif } else { diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc new file mode 100644 index 0000000000000..1b0c0e444347a --- /dev/null +++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include + +#include "paddle/fluid/operators/controlflow/logical_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class LogicalNotNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("LogicalNot", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + logical_not, + ops::LogicalNotNPUKernel); + +#endif diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 39e9d37ddc6c7..ab535e341f757 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -1363,7 +1363,14 @@ REGISTER_OP_KERNEL( conv2d_grad_grad, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel); - +// ROCM has limit thread in depthwise_conv.cu and willl result in accuracy issue +// Use depthwise_conv2d in MIOPEN to resolve this issue +REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); REGISTER_OP_CUDA_KERNEL( depthwise_conv2d_grad_grad, paddle::operators::CUDNNConvDoubleGradOpKernel, diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index a712d31cf7e2c..c4cd5854c0f78 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -490,10 +490,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { bool deterministic = FLAGS_cudnn_deterministic; T* input_grad_data = nullptr; T* filter_grad_data = nullptr; - if (input_grad) - input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - if (filter_grad) - filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); if (input_grad) { input_grad_data = 
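Both NPU-related hunks above address the same pitfall: a tensor that lives on the NPU cannot be dereferenced on the host until the copy back to the CPU has actually finished, so the device context is drained before the host pointer is read. A minimal sketch of the pattern, using the same APIs that appear in the patch (simplified, no error handling, and assuming a bool condition tensor as in conditional_block):

    // Sketch only: stage an NPU-resident flag through a CPU tensor and synchronize
    // the producing device context before touching the host pointer.
    static bool ReadScalarCondition(const framework::LoDTensor& cond) {
      framework::LoDTensor cpu_tensor;
      framework::TensorCopy(cond, platform::CPUPlace(), &cpu_tensor);    // async copy
      platform::DeviceContextPool::Instance().Get(cond.place())->Wait();  // flush stream
      return cpu_tensor.data<bool>()[0];  // now safe to read on the host
    }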
input_grad->mutable_data(ctx.GetPlace()); @@ -884,7 +880,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { int iwo_group = groups; int c_group = 1; -#if CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) iwo_group = 1; c_group = groups; groups = 1; @@ -948,7 +944,8 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args1.idesc.set(transformed_ddO_channel, iwo_group); args1.wdesc.set(*W, layout, iwo_group); args1.odesc.set(transformed_ddX, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args1.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; workspace_size = search1::GetWorkspaceSize(args1); @@ -967,7 +964,8 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args2.idesc.set(transformed_ddO_channel, iwo_group); args2.wdesc.set(*ddW, layout, iwo_group); args2.odesc.set(transformed_X, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args2.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; workspace_size = @@ -991,7 +989,8 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args3.odesc.set(transformed_ddX_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args3.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search3 = SearchAlgorithm; workspace_size = @@ -1013,7 +1012,8 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args4.idesc.set(transformed_dO, iwo_group); args4.wdesc.set(*ddW, layout, iwo_group); args4.odesc.set(transformed_dX_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args4.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search4 = SearchAlgorithm; workspace_size = @@ -1083,6 +1083,10 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { if (ddW) { for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + Tensor conv_x_ddw(dO->type()); + conv_x_ddw.Resize(transformed_ddO_channel.dims()); + T* conv_x_ddw_data = conv_x_ddw.mutable_data(ctx.GetPlace()); wkspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -1090,11 +1094,17 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { handle, &alpha, args2.odesc.desc(), x + i * group_offset_in, args2.wdesc.desc(), ddw + i * group_offset_filter, args2.cdesc.desc(), - bwd_algo2, &alpha, args2.idesc.desc(), - transformed_ddy_channel + i * group_offset_out, - workspace_ptr, workspace_size)); + bwd_algo2, &beta, args2.idesc.desc(), + conv_x_ddw_data + i * group_offset_out, workspace_ptr, + workspace_size)); }, workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + handle, miopenTensorOpAdd, &alpha, args2.idesc.desc(), + transformed_ddy_channel + i * group_offset_out, &alpha, + args2.idesc.desc(), conv_x_ddw_data + i * group_offset_out, &beta, + args2.idesc.desc(), + transformed_ddy_channel + i * group_offset_out)); #else // PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { diff --git 
a/paddle/fluid/operators/copy_cross_scope_op.cc b/paddle/fluid/operators/copy_cross_scope_op.cc new file mode 100644 index 0000000000000..721354954c703 --- /dev/null +++ b/paddle/fluid/operators/copy_cross_scope_op.cc @@ -0,0 +1,151 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_type_traits.h" + +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + +using LoDTensor = paddle::framework::LoDTensor; +using Tensor = paddle::framework::Tensor; + +namespace paddle { +namespace operators { + +class CopyCrossScopeOp : public framework::OperatorBase { + public: + CopyCrossScopeOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext* ctx) const {} + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + int num_micro_scopes = scope.kids().size(); + int num_micro_batches = Attr("num_micro_batches"); + bool ToM = Attr("to_main_scope"); + PADDLE_ENFORCE_EQ(num_micro_scopes, num_micro_batches, + platform::errors::InvalidArgument( + "For pipeline, number of micro scopes (%d) should " + "be equal to number of micro batches (%d).", + num_micro_scopes, num_micro_batches)); + const std::string& id_name = Input("Id"); + auto* id_var = scope.FindVar(id_name); + PADDLE_ENFORCE_NOT_NULL( + id_var, + platform::errors::NotFound("No variable with name %s found.", id_name)); + auto id_tensor = id_var->GetMutable(); + auto it = scope.kids().begin(); + framework::Tensor cpu_id_tensor; + TensorCopySync(*id_tensor, platform::CPUPlace(), &cpu_id_tensor); + auto id_value = cpu_id_tensor.data(); + for (auto i = 0; i < *id_value; i++) { + it++; + } + if (it == scope.kids().end()) { + if (ToM) { + auto dst_scope = *it; + const std::string& x_name = Input("X"); + auto* dst_var = dst_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + dst_var, + platform::errors::NotFound( + "No variable with name %s found in source scope.", x_name)); + auto* main_var = scope.FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + main_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", + x_name)); + auto dst_tensor = dst_var->GetMutable(); + auto main_tensor = main_var->GetMutable(); + TensorCopySync(*dst_tensor, main_tensor->place(), main_tensor); + } + return; + } + auto source_scope = *it; + it++; + auto dst_scope = *it; + const std::string& x_name = Input("X"); + auto* source_var = source_scope->FindVar(x_name); + 
PADDLE_ENFORCE_NOT_NULL( + source_var, + platform::errors::NotFound( + "No variable with name %s found in source scope.", x_name)); + auto* dst_var = dst_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + dst_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", x_name)); + auto src_tensor = source_var->GetMutable(); + auto dst_tensor = dst_var->GetMutable(); + TensorCopySync(*src_tensor, dst_tensor->place(), dst_tensor); + + if (ToM) { + auto* main_var = scope.FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + main_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", x_name)); + auto main_tensor = main_var->GetMutable(); + TensorCopySync(*dst_tensor, main_tensor->place(), main_tensor); + } + } +}; + +class CopyCrossScopeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor), The first input tensor of copy_cross_scope op, which " + "is copying micro scope."); + AddInput("Id", + "(Tensor), The second input tensor of copy_cross_scope op, which " + "is a id of the current micro scope."); + AddAttr("to_main_scope", "Return current scope to main scope.") + .SetDefault(false); + AddAttr("num_micro_batches", "Number of micro batches for pipeline."); + AddComment(R"DOC( + This op is used by pipeline to copy tensors across micro batch scopes. + Copy the variable value of the giving Id's micro scope to the micro scope of Id + 1 position. + If need to copy back to the main scope, using to_main_scope option to copy the variable value of + the current micro scope to the main scope. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_WITHOUT_GRADIENT(copy_cross_scope, ops::CopyCrossScopeOp, + ops::CopyCrossScopeOpMaker); diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc new file mode 100644 index 0000000000000..e175b235f9c18 --- /dev/null +++ b/paddle/fluid/operators/copy_cross_scope_test.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
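copy_cross_scope walks the list of micro-batch scopes held by the main scope: the Id-th child scope is the source, the (Id+1)-th is the destination, and with to_main_scope the value is propagated back to the main scope. A standalone illustration of that hop with plain containers (not Paddle code; maps stand in for scopes and a float for the tensor):

    // Illustration only: pick the Id-th "micro scope" and copy its value into the
    // next one, mirroring what the operator above does with LoDTensors.
    #include <cassert>
    #include <iterator>
    #include <list>
    #include <map>
    #include <string>

    int main() {
      std::list<std::map<std::string, float>> micro_scopes(3);  // num_micro_batches = 3
      const int id = 1;

      auto src = micro_scopes.begin();
      std::advance(src, id);            // scope of micro batch `id`
      (*src)["tmp"] = 42.0f;

      auto dst = std::next(src);        // scope of micro batch `id + 1`
      (*dst)["tmp"] = (*src)["tmp"];    // the op does this with TensorCopySync

      assert(std::next(micro_scopes.begin(), 2)->at("tmp") == 42.0f);
      return 0;
    }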
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/copy_cross_scope_op.cc" +#include "paddle/fluid/string/printf.h" + +#define Conn(x, y) x##y + +namespace f = paddle::framework; +namespace p = paddle::platform; + +USE_NO_KERNEL_OP(copy_cross_scope); + +template +void Compare1(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto var_x = scope->Var("tmp"); + auto x = var_x->GetMutable(); + std::vector main_x = {1.0}; + TensorFromVector(main_x, ctx, x); + + auto var_id = scope->Var("Id"); + auto id = var_id->GetMutable(); + std::vector main_id = {1}; + TensorFromVector(main_id, ctx, id); + for (int i = 0; i < 3; i++) { + auto& child_scope = scope->NewScope(); + auto child_var = child_scope.Var("tmp"); + auto tensor_x = child_var->GetMutable(); + std::vector init_x = {static_cast(i)}; + TensorFromVector(init_x, ctx, tensor_x); + } + + ctx.Wait(); + + // run + f::AttributeMap attrs = {{"to_main_scope", false}, {"num_micro_batches", 3}}; + std::map> output; + auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}}, + output, attrs); + + auto place = ctx.GetPlace(); + op->Run(*scope, place); + ctx.Wait(); + + std::list::const_iterator iter = scope->kids().begin(); + iter++; + iter++; + + auto* kid_scope = *iter; + auto* dst_var = kid_scope->FindVar("tmp"); + auto* tensor_out = dst_var->GetMutable(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + int expected = 1; + EXPECT_EQ(static_cast(out_vec[0]), expected); +} + +template +void Compare2(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto var_x = scope->Var("tmp"); + auto x = var_x->GetMutable(); + std::vector main_x = {1.0}; + TensorFromVector(main_x, ctx, x); + + auto var_id = scope->Var("Id"); + auto id = var_id->GetMutable(); + std::vector main_id = {0}; + TensorFromVector(main_id, ctx, id); + for (int i = 0; i < 3; i++) { + auto& child_scope = scope->NewScope(); + auto child_var = child_scope.Var("tmp"); + auto tensor_x = child_var->GetMutable(); + std::vector init_x = {static_cast(i)}; + TensorFromVector(init_x, ctx, tensor_x); + } + + ctx.Wait(); + + // run + f::AttributeMap attrs = {{"to_main_scope", true}, {"num_micro_batches", 3}}; + std::map> output; + auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}}, + output, attrs); + + auto place = ctx.GetPlace(); + op->Run(*scope, place); + ctx.Wait(); + + auto* dst_var = scope->FindVar("tmp"); + auto* tensor_out = dst_var->GetMutable(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + int expected = 0; + EXPECT_EQ(static_cast(out_vec[0]), expected); +} + +#ifdef PADDLE_WITH_CUDA +TEST(copy_cross_scope, CUDA_fp32) { + f::Scope scope; + p::CUDADeviceContext ctx(p::CUDAPlace(0)); + Compare1(&scope, ctx, "copy_cross_scope"); +} + +TEST(copy_cross_scope_to_main_scope, CUDA_fp32) { + f::Scope scope; + p::CUDADeviceContext ctx(p::CUDAPlace(0)); + Compare2(&scope, ctx, "copy_cross_scope"); +} +#elif PADDLE_WITH_ASCEND_CL +TEST(copy_cross_scope, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare1(&scope, ctx, "copy_cross_scope"); +} + +TEST(copy_cross_scope_to_main_scope, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + 
Compare2(&scope, ctx, "copy_cross_scope"); +} +#endif diff --git a/paddle/fluid/operators/diag_embed_op.cu b/paddle/fluid/operators/diag_embed_op.cu index 2e03622e10f0f..7e3ab6be664cb 100644 --- a/paddle/fluid/operators/diag_embed_op.cu +++ b/paddle/fluid/operators/diag_embed_op.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/diag_embed_op.h" diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h index a2279e40623b4..6a34ef48a169d 100644 --- a/paddle/fluid/operators/dist_op.h +++ b/paddle/fluid/operators/dist_op.h @@ -167,6 +167,7 @@ static void DistGradFunction(const framework::ExecutionContext& context) { auto sign = (x_minux_y > static_cast(0)).template cast() * static_cast(1.0) + (x_minux_y < static_cast(0)).template cast() * static_cast(-1.0); + T epsilon = static_cast(1.0e-10f); // 1: Lp-norm(z), z = x-y, compute dz if (p == 0) { @@ -189,12 +190,14 @@ static void DistGradFunction(const framework::ExecutionContext& context) { // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout if (platform::is_cpu_place(context.GetPlace())) { grad_t.device(place) = - (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * + (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) + .pow(p - 1) * sign.eval() * out_grad_t.broadcast(out_bcast_dims); } else { grad_t.device(place) = - (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign * - out_grad_t.broadcast(out_bcast_dims); + (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) + .pow(p - 1) * + sign * out_grad_t.broadcast(out_bcast_dims); } } diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt deleted file mode 100644 index c9db6148bc45d..0000000000000 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ /dev/null @@ -1,76 +0,0 @@ -return() - -if(WITH_GRPC) - set(cc_generic_services "false") -else() - set(cc_generic_services "true") -endif() -configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) - -cc_library(async_sparse_param_update_recorder SRCS async_sparse_param_update_recorder.cc DEPS enforce simple_threadpool) -cc_test(async_sparse_param_update_recorder_test SRCS async_sparse_param_update_recorder_test.cc DEPS async_sparse_param_update_recorder) - -cc_library(heart_beat_monitor SRCS heart_beat_monitor.cc DEPS enforce simple_threadpool) -cc_library(large_scale_kv SRCS large_scale_kv.cc DEPS enforce simple_threadpool device_context) -cc_test(heart_beat_monitor_test SRCS heart_beat_monitor_test.cc DEPS heart_beat_monitor) - -# FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files -set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") -if(WITH_GRPC) - set(GRPC_DEPS grpc++_unsecure grpc_unsecure gpr zlib protobuf) - set(GRPC_SRCS grpc/grpc_client.cc grpc/grpc_server.cc grpc/grpc_serde.cc grpc/grpc_bytebuffer_stream.cc grpc/grpc_variable_response.cc) - grpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc - variable_response.cc - collective_client.cc collective_server.cc - ${GRPC_SRCS} - PROTO send_recv.proto - DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS} async_sparse_param_update_recorder heart_beat_monitor large_scale_kv) - - 
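The epsilon introduced in dist_op.h above protects the Lp-distance gradient, dz = (|x - y| / ||x - y||_p)^(p-1) * sign(x - y) * dout, in the degenerate case x == y: the norm is then zero and the quotient is 0/0, which turns the whole gradient into NaN. A one-element check of the effect (plain C++ rather than the Eigen expression used in the kernel):

    // Single-element dist gradient with and without the epsilon guard.
    #include <cmath>
    #include <cstdio>

    int main() {
      const float p = 2.0f, dout = 1.0f, epsilon = 1.0e-10f;
      const float x = 3.0f, y = 3.0f;            // identical inputs -> distance 0
      const float diff = std::fabs(x - y);
      const float out = diff;                    // Lp norm of a single element
      const float sign = (x > y) - (x < y);      // sign(x - y), 0 here
      const float grad_naive =
          std::pow(diff / out, p - 1.0f) * sign * dout;              // 0/0 -> NaN
      const float grad_safe =
          std::pow(diff / (out + epsilon), p - 1.0f) * sign * dout;  // 0
      std::printf("naive: %f  guarded: %f\n", grad_naive, grad_safe);
      return 0;
    }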
set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) - - cc_test(grpc_serde_test SRCS grpc/grpc_serde_test.cc - DEPS ${RPC_DEPS} scope profiler math_function) - -else() - set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) - set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc parameter_recv.cc communicator.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - - set(BRPC_DEPS brpc ssl crypto protobuf leveldb zlib) - - brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc - variable_response.cc - collective_client.cc collective_server.cc - ${BRPC_SRCS} - PROTO send_recv.proto - DEPS lod_tensor selected_rows memory scope ${BRPC_DEPS}) - - set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS}) - cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc - DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_read_op) -endif() - - -cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op checkpoint_notify_op scale_op ) -cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) -cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory node) -cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) -cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) -cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv generator) -cc_test(communicator_test SRCS communicator_test.cc DEPS communicator) -if(WITH_GPU OR WITH_ROCM) - cc_test(collective_server_test SRCS collective_server_test.cc - DEPS sendrecvop_rpc executor ${RPC_DEPS} - selected_rows_functor scope math_function) -endif() -if(WITH_TESTING) - if(TEST rpc_server_test) - set_tests_properties(rpc_server_test PROPERTIES TIMEOUT 120) - endif() - if(TEST heart_beat_monitor_test) - set_tests_properties(heart_beat_monitor_test PROPERTIES TIMEOUT 120) - endif() -endif() diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h deleted file mode 100644 index 28a5f2ad6c764..0000000000000 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class ConcurrentSet { - public: - ConcurrentSet() : pool_(new ::ThreadPool(1)) {} - ~ConcurrentSet() {} - - std::future Update(const std::vector& rows) { - auto task = [this, rows] { - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& id : rows) { - sstream << id << ", "; - } - sstream << "]"; - VLOG(3) << "update ids -> " << sstream.str(); - } - for (auto row : rows) { - set_.insert(row); - } - }; - return pool_->enqueue(std::move(task)); - } - - std::future GetAndClear(std::vector* result) { - auto task = [this, &result] { - result->clear(); - for (auto& id : set_) { - result->push_back(id); - } - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& id : *result) { - sstream << id << ", "; - } - sstream << "]"; - VLOG(3) << "result ids size: " << result->size() << " " - << sstream.str(); - } - set_.clear(); - }; - return pool_->enqueue(std::move(task)); - } - - private: - std::unordered_set set_; - std::unique_ptr<::ThreadPool> pool_{nullptr}; -}; - -class AsyncSparseParamUpdateRecorder { - using TrainerToRows = std::vector>; - - public: - AsyncSparseParamUpdateRecorder( - int trainer_num, - const std::unordered_map& grad_to_param) - : trainer_num_(trainer_num), grad_to_param_(grad_to_param) { - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& item : grad_to_param) { - sstream << item.first << ":" << item.second << ", "; - } - sstream << "]"; - VLOG(3) << "trainer_num: " << trainer_num - << " grad_to_param_: " << sstream.str(); - } - for (auto& iter : grad_to_param) { - param_to_grad_[iter.second] = iter.first; - auto& param_name = iter.second; - param_to_updated_rows_[param_name] = TrainerToRows(); - auto& trainer_to_rows = param_to_updated_rows_[param_name]; - for (auto i = 0; i < trainer_num; ++i) { - trainer_to_rows.emplace_back(new ConcurrentSet()); - } - } - } - - ~AsyncSparseParamUpdateRecorder() = default; - - void Update(const std::string& grad_name, - const std::vector& update_rows) { - VLOG(3) << "update grad: " << grad_name - << " row size: " << update_rows.size(); - auto& param_name = grad_to_param_.at(grad_name); - auto& trainer_to_rows = param_to_updated_rows_.at(param_name); - - std::vector> fs; - for (auto& set : trainer_to_rows) { - fs.push_back(set->Update(update_rows)); - } - for (auto& f : fs) { - f.wait(); - } - } - - void GetAndClear(const std::string& param_name, int trainer_id, - std::vector* result) { - VLOG(3) << "GetAndClear param: " << param_name - << " for trainer: " << trainer_id; - PADDLE_ENFORCE_LT( - trainer_id, trainer_num_, - platform::errors::InvalidArgument( - "The value of trainer_id: %s should less than trainer_num: %s.", - trainer_id, trainer_num_)); - param_to_updated_rows_.at(param_name)[trainer_id] - ->GetAndClear(result) - .wait(); - } - - bool HasParam(const std::string& param_name) { - return param_to_grad_.find(param_name) != param_to_grad_.end(); - } - - bool HasGrad(const std::string& grad_name) { - return grad_to_param_.find(grad_name) != grad_to_param_.end(); - } - - private: - const int trainer_num_; - std::unordered_map grad_to_param_; - std::unordered_map param_to_grad_; - std::unordered_map param_to_updated_rows_; - - // init recorder - public: - static void Init( - int trainer_num, - const 
std::unordered_map& grad_to_param) { - InitImpl(trainer_num, grad_to_param); - } - - static AsyncSparseParamUpdateRecorder* GetInstance() { - return recorder_.get(); - } - - private: - // Init is called by GetInstance. - static void InitImpl( - int trainer_num, - const std::unordered_map& grad_to_param) { - if (recorder_ == nullptr) { - recorder_.reset( - new AsyncSparseParamUpdateRecorder(trainer_num, grad_to_param)); - } - } - - static std::once_flag init_flag_; - static std::unique_ptr recorder_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc deleted file mode 100644 index 2d78559625c91..0000000000000 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" -#include -#include "gtest/gtest.h" - -namespace paddle { -namespace operators { -namespace distributed { - -TEST(ConcurrentSet, All) { - ConcurrentSet concurrent_set; - std::vector in1 = {1, 2, 3, 4}; - std::vector in2 = {2, 3, 5, 6}; - - std::vector> futures; - futures.push_back(concurrent_set.Update(in1)); - futures.push_back(concurrent_set.Update(in2)); - - for (auto &f : futures) { - f.wait(); - } - - std::unordered_set in; - std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); - std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); - - std::vector ret; - concurrent_set.GetAndClear(&ret).wait(); - - std::unordered_set out; - std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin())); - - EXPECT_EQ(in, out); - - concurrent_set.GetAndClear(&ret).wait(); - EXPECT_EQ(ret.size(), 0UL); -} - -TEST(AsyncSparseParamUpdateRecorder, All) { - std::unordered_map grad_to_param; - grad_to_param["grad1"] = "param1"; - grad_to_param["grad2"] = "param2"; - - int trainer_num = 10; - - AsyncSparseParamUpdateRecorder recorder(trainer_num, grad_to_param); - std::vector in1 = {1, 2, 3, 4}; - std::vector in2 = {2, 3, 5, 6}; - - std::unordered_set in; - std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); - std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); - - recorder.Update("grad1", in1); - recorder.Update("grad1", in2); - - EXPECT_TRUE(recorder.HasParam("param1")); - EXPECT_TRUE(recorder.HasParam("param2")); - EXPECT_FALSE(recorder.HasParam("param3")); - - EXPECT_TRUE(recorder.HasGrad("grad1")); - EXPECT_TRUE(recorder.HasGrad("grad2")); - EXPECT_FALSE(recorder.HasGrad("grad3")); - - std::vector ret; - EXPECT_ANY_THROW(recorder.GetAndClear("param1", trainer_num, &ret)); - - for (int i = 0; i < trainer_num; ++i) { - std::vector ret; - std::unordered_set out; - - recorder.GetAndClear("param1", i, &ret); - std::copy(ret.begin(), 
ret.end(), std::inserter(out, out.begin())); - - EXPECT_EQ(in, out); - - recorder.GetAndClear("param1", i, &ret); - EXPECT_EQ(ret.size(), 0UL); - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc deleted file mode 100644 index b2a26089c8689..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc +++ /dev/null @@ -1,462 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/brpc/brpc_client.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { -namespace distributed { - -DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); -DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); - -BRPCClient::~BRPCClient() { Wait(); } - -void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used by other now. 
- ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to send variable %s, error text is %s.", var_h->name(), - cntl->ErrorText())); - var_h->Finish(false); - cls->DecreaseReqCount(); - return; - } - var_h->Finish(true); - cls->DecreaseReqCount(); - - VLOG(4) << "HandleSendResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - VLOG(4) << "Finish HandleSendResponse"; -} - -VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - const std::string method = kSendRPC; - VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); - - auto* var = p_scope->FindVar(var_name_val); - sendrecv::VariableMessage request; - distributed::SerializeToIOBuf(var_name_val, var, *p_ctx, &request, - &cntl->request_attachment(), "", false, - trainer_id_); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - platform::RecordRPCEvent record_event(method); - - ch_ctx->stub->SendVariable(cntl, &request, response, done); - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - req_count_++; - - return var_h; -} -void HandleFetchBarrierResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used other now. - ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to get HandleFetchBarrierResponse %s, error text is %s.", - var_h->name(), cntl->ErrorText())); - var_h->Finish(false); - cls->DecreaseReqCount(); - return; - } - - var_h->Finish(true); - cls->DecreaseReqCount(); - - VLOG(4) << "HandleFetchBarrierResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - VLOG(4) << "Finish HandleFetchBarrierResponse"; -} -void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, VarHandlePtr var_h, - ChannelQueuePtr ch_ptr, ChannelContextPtr ch_ctx, - BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used other now. 
- ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to get variable %s, error text is %s.", var_h->name(), - cntl->ErrorText())); - cls->DecreaseReqCount(); - var_h->Finish(false); - return; - } - - VLOG(4) << "HandleGetResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - - framework::Variable* outvar = nullptr; - int trainer_id; - distributed::DeserializeFromIOBuf(*response, cntl->response_attachment(), - *var_h->ctx(), var_h->scope(), &outvar, - &trainer_id); - VLOG(4) << "Finish HandleGetResponse"; - cls->DecreaseReqCount(); - var_h->Finish(true); -} - -VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& method_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const std::string out_varname_val = out_var_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - const std::string method = kGetRPC; - VarHandlePtr var_h( - new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_out_varname(out_varname_val); - req.set_trainer_id(trainer_id_); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - platform::RecordRPCEvent record_event(method); - - if (method_name == kGetMonomerRPC) { - ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); - } else if (method_name == kGetNoBarrierRPC) { - ch_ctx->stub->GetVariableNoBarrier(cntl, &req, response, done); - } else { - ch_ctx->stub->GetVariable(cntl, &req, response, done); - } - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - - req_count_++; - - return var_h; -} - -VarHandlePtr BRPCClient::AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_var_name, int64_t time_out) { - std::string var_name_no_barrier = - string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); - - return _AsyncGetVar(ep, ctx, scope, var_name_no_barrier, out_var_name, - kGetNoBarrierRPC, time_out); -} - -VarHandlePtr BRPCClient::AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, var_name, kGetMonomerRPC, - time_out); -} - -VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, - const std::string& var_name, - int64_t time_out) { - return AsyncSendMessage(ep, kSendMonomerFetchBarrierRPC, var_name, time_out); -} - -VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC, - time_out); -} - 
-VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string in_var_name_val = in_var_name; - const std::string out_var_name_val = out_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - - const std::string method = kPrefetchRPC; - - VarHandlePtr var_h( - new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - auto* var = p_scope->FindVar(in_var_name_val); - sendrecv::VariableMessage req; - distributed::SerializeToIOBuf(in_var_name_val, var, *p_ctx, &req, - &cntl->request_attachment(), out_var_name_val, - false, 0, table_name_val); - - platform::RecordRPCEvent record_event(method); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - ch_ctx->stub->PrefetchVariable(cntl, &req, response, done); - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - - req_count_++; - return var_h; -} - -VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - return AsyncSendMessage(ep, kBatchBarrierRPC, BATCH_BARRIER_MESSAGE, - time_out); -} - -VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { - auto ch_ptr = GetChannel(ep); - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - sendrecv::VariableMessage req; - req.set_varname(FETCH_BARRIER_MESSAGE); - - const std::string method = kFetchBarrierRPC; - // var handle - VarHandlePtr var_h( - new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); - - platform::RecordRPCEvent record_event(method); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - ch_ctx->stub->GetVariable(cntl, &req, response, done); - - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - - return var_h; -} - -bool BRPCClient::Wait() { - VLOG(9) << "begin to brpcclient wait"; - { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return req_count_ == 0; }); - } - VLOG(9) << "end to brpcclient wait"; - return true; -} - -ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { - VLOG(4) << "begin to GetChannel:" << ep; - { - std::lock_guard guard(chan_mutex_); - auto it = channels_.find(ep); - if (it != channels_.end()) { - VLOG(4) << "end to GetChannel:" << ep; - return it->second; - } - } - - ChannelQueuePtr q(new framework::BlockingQueue()); - - brpc::ChannelOptions options; -#ifdef PADDLE_WITH_BRPC_RDMA - options.use_rdma = true; -#endif - options.protocol = "baidu_std"; - // don't use pooled type. the server can't afford that. 
- options.connection_type = "single"; - options.connect_timeout_ms = 1000; - options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; - options.max_retry = FLAGS_max_retry; - - VLOG(1) << "create " << brpc_channel_num_per_server_ - << " brpc channels to pserver:" << ep; - - for (int i = 0; i < brpc_channel_num_per_server_; ++i) { - std::shared_ptr c(new ChannelContext()); - if (c->channel.Init(ep.c_str(), &options) != 0) { - PADDLE_THROW( - platform::errors::Unavailable("Failed to initialize channel.")); - return nullptr; - } - - c->stub.reset(new sendrecv::SendRecvService_Stub( - static_cast(&c->channel))); - q->Push(c); - } - - { - std::lock_guard guard(chan_mutex_); - channels_[ep] = q; - } - - VLOG(4) << "end to GetChannel:" << ep; - return q; -} - -VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, - int64_t time_out) { - return AsyncSendMessage(ep, kSendCompleteRPC, COMPLETE_MESSAGE, time_out); -} - -void BRPCClient::SendComplete() { - for (auto& kv : channels_) { - AsyncSendComplete(kv.first); - } -} - -VarHandlePtr BRPCClient::AsyncSendVarMessage( - const std::string& ep, const std::string& method_name, - const sendrecv::VariableMessage& req, int64_t time_out) { - auto ch_ptr = GetChannel(ep); - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); - - platform::RecordRPCEvent record_event(method_name); - - VarHandlePtr var_h( - new VarHandle(ep, method_name, req.varname(), nullptr, nullptr)); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - if (method_name == kCheckPointNotifyRPC) { - ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); - } else if (method_name == kSendMonomerFetchBarrierRPC) { - ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); - } else { - ch_ctx->stub->SendVariable(cntl, &req, response, done); - } - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - - return var_h; -} - -VarHandlePtr BRPCClient::AsyncSendMessage(const std::string& ep, - const std::string& method_name, - const std::string& message, - int64_t time_out) { - sendrecv::VariableMessage req; - req.set_varname(message); - - return AsyncSendVarMessage(ep, method_name, req, time_out); -} - -VarHandlePtr BRPCClient::AsyncCheckpointNotify(const std::string& ep, - const std::string& dirname, - const std::string& varname, - const int mode, - int64_t time_out) { - sendrecv::VariableMessage req; - req.set_varname(varname); - req.set_out_varname(dirname); - - return AsyncSendVarMessage(ep, "CheckPointNotifyRPC", req, time_out); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h deleted file mode 100644 index 91f94b4c9d5a3..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.h +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include // NOLINT -#include -#include -#include -#include -#include -#include // NOLINT -#include -#include -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN - -namespace paddle { -namespace operators { -namespace distributed { - -struct ChannelContext { - brpc::Channel channel; - std::shared_ptr stub; -}; - -typedef std::shared_ptr ChannelContextPtr; -typedef std::shared_ptr> - ChannelQueuePtr; - -class BRPCClient : public RPCClient { - public: - BRPCClient() {} - virtual ~BRPCClient(); - - VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVarNoBarrier(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline); - - VarHandlePtr AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendFetchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) override; - - bool Wait() override; - - void SendComplete() override; - - private: - VarHandlePtr _AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_var_name, const 
std::string& method_name, - const std::string& table_name, int64_t time_out = FLAGS_rpc_deadline); - - void Proceed(); - ChannelQueuePtr GetChannel(const std::string& ep); - - VarHandlePtr AsyncSendComplete(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline); - - VarHandlePtr AsyncSendMessage(const std::string& ep, - const std::string& method_name, - const std::string& message, int64_t time_out); - - VarHandlePtr AsyncSendVarMessage(const std::string& ep, - const std::string& method_name, - const sendrecv::VariableMessage& req, - int64_t time_out); - - friend void HandleSendResponse(brpc::Controller* cntl, - sendrecv::VoidMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls); - - friend void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls); - - friend void HandleFetchBarrierResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, - ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, - BRPCClient* cls); - void DecreaseReqCount() { - if (--req_count_ <= 0) { - sync_cond_.notify_all(); - } - } - - private: - std::unordered_map channels_; - - // mutex for Wait client sync - std::mutex sync_mutex_; - std::condition_variable sync_cond_; - std::atomic req_count_{0}; - - static constexpr int brpc_channel_num_per_server_ = 4; - - // mutex for GetChannel thread safety - std::mutex chan_mutex_; - DISABLE_COPY_AND_ASSIGN(BRPCClient); -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc deleted file mode 100644 index 94f0b9919ace8..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifdef PADDLE_WITH_BRPC_RDMA - -#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" -#include "brpc/channel.h" -#include "brpc/rdma/rdma_helper.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -RdmaMemPool& RdmaMemPool::Instance() { - static RdmaMemPool* g_rdma_mem_pool = new RdmaMemPool(); - return *g_rdma_mem_pool; -} - -void* RdmaMemPool::Find(const std::string& varname, int64_t size) { - pthread_rwlock_rdlock(&access_); - auto it = pool_.find(varname); - if (it == pool_.end()) { - pthread_rwlock_unlock(&access_); - return nullptr; - } - - auto info = it->second; - if (info.data_size != size) { - pthread_rwlock_unlock(&access_); - PADDLE_THROW(platform::errors::InvalidArgument( - "var:%s size:%ld != %ld", varname, size, info.data_size)); - return nullptr; - } - - pthread_rwlock_unlock(&access_); - return info.data; -} - -void RdmaMemPool::Register(const std::string& varname, void* data, - int64_t data_size) { - void* old = Find(varname, data_size); - if (old != nullptr) { - PADDLE_ENFORCE_EQ( - data, old, platform::errors::InvalidArgument("var:%s data:%ld != %ld", - varname, data, old)); - VLOG(7) << "Find on rdma:" << varname << " data:" << data - << " data_size:" << data_size; - return; - } - - VarInfo info; - info.data = data; - info.data_size = data_size; - - pthread_rwlock_wrlock(&access_); - pool_[varname] = info; - pthread_rwlock_unlock(&access_); - - if (brpc::rdma::RegisterMemoryForRdma(data, data_size)) { - PADDLE_THROW(platform::errors::Unavailable( - "Register memory for RDMA failed. Register %s data: %s data size %d " - "error.", - varname, data, data_size)); - } - - VLOG(4) << "register on rdma:" << varname << " data:" << data - << " data_size:" << data_size; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle - -#endif diff --git a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h deleted file mode 100644 index 156a93ec57847..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#ifdef PADDLE_WITH_BRPC_RDMA - -#include // NOLINT -#include -#include - -namespace paddle { -namespace operators { -namespace distributed { - -/* - * This class is used to avoid duplicated registion of brpc::rdma. 
- */ -class RdmaMemPool { - public: - static RdmaMemPool& Instance(); - RdmaMemPool() : access_(PTHREAD_RWLOCK_INITIALIZER) {} - - virtual ~RdmaMemPool() { pthread_rwlock_destroy(&access_); } - - void Register(const std::string& varname, void* data, int64_t size); - void* Find(const std::string& varname, int64_t size); - - private: - struct VarInfo { - void* data; - int64_t data_size; - - VarInfo() : data(nullptr), data_size(0) {} - }; - - private: - std::unordered_map pool_; - pthread_rwlock_t access_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle - -#endif diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc deleted file mode 100644 index 411c0f36debd3..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_NCCL -#include -#endif -#ifdef PADDLE_WITH_RCCL -#include -#endif -#include -#include -#include -#include // NOLINT - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class IOBufWriter { - public: - static void Append(const std::string& varname, butil::IOBuf* iobuf, int k, - const char* v, int64_t vlen) { - if (vlen >= std::numeric_limits::max() || vlen < 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Variable lenght is invalid. 
Variable name is %s, length is %d.", - varname, vlen)); - } - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - iobuf->append(v, vlen); - } - - static void AppendTCPZeroCopy(butil::IOBuf* iobuf, int k, const char* v, - int64_t vlen, bool in_cuda_pinned, - void (*destroy)(void*), void* user_data) { - VLOG(7) << "AppendTCPZeroCopy " - << " k:" << k - << " data:" << static_cast(const_cast(v)) - << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - - // FIXME(gongwb): use append_zerocopy - /* - if (in_cuda_pinned) { - iobuf->append_zerocopy(v, vlen, IOBufWriter::FreeMemory); - } else { - iobuf->append_zerocopy(v, vlen, nullptr); - } - */ - iobuf->append(v, vlen); - destroy(user_data); - } - -#ifdef PADDLE_WITH_BRPC_RDMA - static void AppendRdmaZeroCopy(const std::string varname, butil::IOBuf* iobuf, - int k, const char* v, int64_t vlen, - bool in_cuda_pinned, void (*destroy)(void*), - void* user_data) { - VLOG(7) << "AppendRdmaZeroCopy varname:" << varname << " k:" << k - << " data:" << static_cast(const_cast(v)) - << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - - RdmaMemPool::Instance().Register( - varname, static_cast(const_cast(v)), vlen); - - // FIXME(gongwb): use append_zerocopy - // iobuf->append_zerocopy(v, vlen, nullptr); - iobuf->append(v, vlen); - destroy(user_data); - return; - } -#endif - - static void AppendZeroCopy(const std::string varname, butil::IOBuf* iobuf, - int k, const char* v, int64_t vlen, - bool in_cuda_pinned, void (*destroy)(void*), - void* user_data) { - if (vlen >= std::numeric_limits::max() || vlen < 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Variable lenght is invalid. Variable name is %s, length is %d.", - varname, vlen)); - } - -#ifdef PADDLE_WITH_BRPC_RDMA - IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned, - destroy, user_data); -#else - IOBufWriter::AppendTCPZeroCopy(iobuf, k, v, vlen, in_cuda_pinned, destroy, - user_data); -#endif - } -}; - -void SerializeToIOBuf(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, VarMsg* request, - butil::IOBuf* iobuf, const std::string& out_varname, - bool var_is_not_stable, int trainer_id, - const std::string& table_name) { - std::unique_ptr payload; - - request->set_varname(name); - request->set_trainer_id(trainer_id); - // Note: normally the profiler is enabled in 1 trainer, hence only - // 1 trainer returns true for ShouldSendProfileState(). It tells PS - // servers the trainer's profiling state so that PS can follow the - // trainer. 
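The IOBufWriter helpers above all frame a payload the same way: a 4-byte field number, an 8-byte length, then the raw bytes, which is exactly what the brpc variable response reads back on the receiving side with ReadLittleEndian32/64. A minimal sketch of that framing, using std::string in place of butil::IOBuf (illustrative only, not a Paddle API):

#include <cstdint>
#include <string>

// Append one [field(4B)][length(8B)][payload] record, mirroring IOBufWriter::Append.
// Raw byte copies match the little-endian reader only on little-endian hosts, as in the original.
void AppendRecord(std::string* buf, int32_t field, const char* data, int64_t len) {
  buf->append(reinterpret_cast<const char*>(&field), 4);
  buf->append(reinterpret_cast<const char*>(&len), 8);
  buf->append(data, static_cast<size_t>(len));
}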
- if (platform::ShouldSendProfileState()) { - if (platform::IsProfileEnabled()) { - request->set_profile(platform::kEnableProfiler); - } else { - request->set_profile(platform::kDisableProfiler); - } - } - if (!out_varname.empty()) { - request->set_out_varname(out_varname); - } - if (!table_name.empty()) { - request->set_table_name(table_name); - } - if (var->IsType()) { - request->set_type(::sendrecv::LOD_TENSOR); - payload.reset(new TensorPayload(GetTensorPayload(var, ctx, request))); - } else if (var->IsType()) { - request->set_type(::sendrecv::SELECTED_ROWS); - payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request))); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - } else if (var->IsType()) { - request->set_type(::sendrecv::NCCL_ID); - const ncclUniqueId& uid = var->Get(); - // TODO(gongwb): use append_zero to avoid data copy. - IOBufWriter::Append(name, iobuf, - sendrecv::VariableMessage::kSerializedFieldNumber, - uid.internal, NCCL_UNIQUE_ID_BYTES); - return; -#endif - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Serialize does not support type: %s", typeid(var->Type()).name())); - } - - PADDLE_ENFORCE_NOT_NULL( - payload, - platform::errors::InvalidArgument( - "Not support type: %s, need to be LOD_TENSOR or SELECTED_ROWS.", - var->Type())); - - // FIXME(gongwb): it seems that can use zero copy. - if (var_is_not_stable) { - IOBufWriter::Append( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size()); - } else { - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - IOBufWriter::AppendZeroCopy( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size(), - true, SerializeDestroyCallback, static_cast(payload.get())); - payload.release(); -#endif - } else { - IOBufWriter::AppendZeroCopy( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size(), - false, SerializeDestroyCallback, static_cast(payload.get())); - payload.release(); - } - } - - if (var->IsType()) { - auto* slr = var->GetMutable(); - PADDLE_ENFORCE_EQ(VectorElemName(slr->rows()), typeid(int64_t).name(), - platform::errors::InvalidArgument( - "Got wrong type: %s, expect type: int64_t", - VectorElemName(slr->rows()))); - size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); - - IOBufWriter::Append(name, iobuf, - ::sendrecv::VariableMessage::kRowsFieldNumber, - reinterpret_cast(slr->rows().data()), - static_cast(rows_memory_size)); - } -} - -void DeserializeFromIOBuf(const ::sendrecv::VariableMessage& meta, - const butil::IOBuf& iobuf, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id) { - operators::distributed::BRPCVariableResponse resp(scope, &ctx); - PADDLE_ENFORCE_EQ( - resp.Parse(iobuf, meta), 0, - platform::errors::InvalidArgument("parse iobuf to tensor error!")); - *var = resp.GetVar(); - *trainer_id = resp.GetTrainerId(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h deleted file mode 100644 index a5bdc331eb29c..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" - -namespace paddle { -namespace operators { -namespace distributed { - -void SerializeToIOBuf(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, VarMsg* request, - butil::IOBuf* iobuf, const std::string& out_varname, - bool var_is_not_stable, const int trainer_id = 0, - const std::string& table_name = std::string()); - -void DeserializeFromIOBuf(const VarMsg& meta, const butil::IOBuf& iobuf, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc deleted file mode 100644 index bcf20ad076b11..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc +++ /dev/null @@ -1,175 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT - -#include "brpc/channel.h" -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/variable_response.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/printf.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace operators = paddle::operators; -namespace math = paddle::operators::math; -namespace memory = paddle::memory; - -void RunSerdeTestSelectedRows(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - butil::IOBuf iobuf; - sendrecv::VariableMessage msg; - int tensor_numel = 564 * 128; - - // serialize var to IOBuf - { - framework::Variable var; - auto* slr = var.GetMutable(); - slr->set_height(1000); - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({564, 128})); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 564; ++i) rows->push_back(i); - - operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, - "", false); - } - - // desrialize - { - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::BRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(iobuf, msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto* slr2 = var2->GetMutable(); - auto* tensor2 = slr2->mutable_value(); - auto* rows2 = slr2->mutable_rows(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(*tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2->data()); - } - const int64_t* rows_data2 = rows2->data(); - - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); - } - for (size_t i = 0; i < rows2->size(); ++i) { - EXPECT_EQ(rows_data2[i], static_cast(i)); - } - EXPECT_EQ(slr2->height(), 1000); - } -} - -void RunTestLodTensor(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // serialize var to ByteBuffer - butil::IOBuf iobuf; - sendrecv::VariableMessage msg; - int tensor_numel = 512 * 8 * 4 * 2; - { - framework::Variable var; - auto* tensor = var.GetMutable(); - tensor->Resize(framework::make_ddim({512, 8, 4, 2})); - framework::LoD lod; - lod.push_back(framework::Vector({1, 3, 8})); - tensor->set_lod(lod); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 31.9); - - operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, - "", false); - } - - // check sendrecv::VariableMessage meta data - { - EXPECT_EQ(msg.varname(), "myvar"); - EXPECT_EQ(msg.type(), 0); - EXPECT_EQ(msg.dims()[0], 512); - EXPECT_EQ(msg.dims()[1], 8); - EXPECT_EQ(msg.dims()[2], 4); - EXPECT_EQ(msg.dims()[3], 2); - EXPECT_EQ(msg.lod_level(), 1); - EXPECT_EQ(msg.lod(0).lod_data(0), 1); - 
EXPECT_EQ(msg.lod(0).lod_data(1), 3); - EXPECT_EQ(msg.lod(0).lod_data(2), 8); - } - - // deserialize - { - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::BRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(iobuf, msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto tensor2 = var2->Get(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2.data()); - } - - for (int i = 0; i < tensor_numel; ++i) - EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); - } -} - -TEST(LodTensor, Run) { - platform::CPUPlace place; - RunTestLodTensor(place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu(0); - RunTestLodTensor(gpu); -#endif -} - -TEST(SelectedRows, Run) { - platform::CPUPlace place; - RunSerdeTestSelectedRows(place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu; - RunSerdeTestSelectedRows(gpu); -#endif -} diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc deleted file mode 100644 index 5ca26f006bf20..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.cc +++ /dev/null @@ -1,417 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
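The BRPCServiceImpl that follows hands every incoming RPC to a per-request-type paddle::framework::ThreadPool (send_threads_, get_threads_, and so on) and returns to brpc immediately; the lambda captures the controller, request, response, and closure by value and does the real work off the RPC thread. A stand-alone sketch of that dispatch shape, with std::async standing in for the removed ThreadPool and hypothetical stand-in types:

#include <future>
#include <iostream>
#include <string>

// Hypothetical stand-ins for the generated message types and the brpc closure.
struct VariableMessage { std::string varname; };
struct VoidMessage {};
struct Closure { void Run() { std::cout << "done\n"; } };

// The real handler work, run off the RPC thread.
void DoSend(const VariableMessage& req, VoidMessage* /*resp*/, Closure* done) {
  std::cout << "handling " << req.varname << "\n";
  done->Run();  // in the removed code, brpc::ClosureGuard does this on scope exit
}

int main() {
  VariableMessage req{"w@GRAD"};
  VoidMessage resp;
  Closure done;
  // Queue the work and return right away, mirroring send_threads_->Run([=] { ... });
  auto fut = std::async(std::launch::async, [&] { DoSend(req, &resp, &done); });
  fut.wait();
  return 0;
}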
- -#include "paddle/fluid/operators/distributed/brpc/brpc_server.h" -#include -#include -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace sendrecv { - -namespace distributed = paddle::operators::distributed; - -typedef std::unordered_map - HandlerMap; - -class BRPCServiceImpl : public SendRecvService { - public: - explicit BRPCServiceImpl(const HandlerMap& rpc_call_map, - distributed::RPCServer* rpc_server) - : rpc_server_(rpc_server) { - VLOG(3) << "BRPCServiceImpl size: " << rpc_call_map.size(); - auto it = rpc_call_map.find(distributed::kRequestSend); - if (it != rpc_call_map.end()) { - request_send_h_ = it->second; - send_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestSend))); - } - - it = rpc_call_map.find(distributed::kRequestGet); - if (it != rpc_call_map.end()) { - request_get_h_ = it->second; - get_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestGet))); - } - - it = rpc_call_map.find(distributed::kRequestGetNoBarrier); - if (it != rpc_call_map.end()) { - request_getnobarrier_h_ = it->second; - getnobarrier_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestGetNoBarrier))); - } - - it = rpc_call_map.find(distributed::kRequestPrefetch); - if (it != rpc_call_map.end()) { - request_prefetch_h_ = it->second; - prefetch_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); - } - - it = rpc_call_map.find(distributed::kRequestCheckpoint); - if (it != rpc_call_map.end()) { - request_checkpoint_h_ = it->second; - checkpoint_notify_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); - } - - it = rpc_call_map.find(distributed::kRequestGetMonomerVariable); - if (it != rpc_call_map.end()) { - request_get_monomer_handler_h_ = it->second; - } - - it = rpc_call_map.find(distributed::kRequestGetMonomerBarrier); - if (it != rpc_call_map.end()) { - request_get_monomer_barrier_handler_h_ = it->second; - } - } - - virtual ~BRPCServiceImpl() {} - void SendVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - send_threads_->Run( - [=] { _SendVariable(cntl_butil, request, response, done); }); - } - - void _SendVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_send_h_, platform::errors::PreconditionNotMet( - "RequestSend handler should be registed first!")); - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - VLOG(3) << "RequestSend var_name:" << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - distributed::BRPCVariableResponse resp(request_send_h_->scope(), - request_send_h_->dev_ctx(), - request_send_h_->distributed_mode()); - PADDLE_ENFORCE_EQ( - resp.Parse(cntl->request_attachment(), *request), 0, - platform::errors::InvalidArgument("parse iobuf to tensor error!")); - - auto scope = resp.GetMutableLocalScope(); - auto invar = resp.GetVar(); - int 
trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = nullptr; - - request_send_h_->Handle(varname, scope, invar, &outvar, trainer_id); - } - - void GetVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VariableMessage* response, - google::protobuf::Closure* done) override { - get_threads_->Run( - [=] { _GetVariable(cntl_butil, request, response, done); }); - } - - void GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - getnobarrier_threads_->Run( - [=] { _GetVariableNoBarrier(cntl_butil, request, response, done); }); - } - - void _GetVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_get_h_, platform::errors::PreconditionNotMet( - "RequestGet handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - std::string out_varname = request->out_varname(); - VLOG(3) << "RequestGet varname:" << varname - << ", out_varname:" << out_varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - auto scope = request_get_h_->scope(); - paddle::framework::Variable* invar = nullptr; - int trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = nullptr; - - request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - distributed::SerializeToIOBuf(out_varname, outvar, - *request_get_h_->dev_ctx(), response, - &cntl->response_attachment(), "", false); - } - } - - void _GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_getnobarrier_h_, - platform::errors::PreconditionNotMet( - "RequestGetNoBarrier handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - std::string out_varname = request->out_varname(); - int trainer_id = request->trainer_id(); - - VLOG(3) << "RequestGetNoBarrier varname:" << varname - << ", out_varname:" << out_varname << ", trainer_id:" << trainer_id - << ", from:" << cntl->remote_side(); - - auto scope = request_getnobarrier_h_->scope(); - paddle::framework::Variable* invar = nullptr; - paddle::framework::Variable* outvar = nullptr; - - request_getnobarrier_h_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - distributed::SerializeToIOBuf( - out_varname, outvar, *request_getnobarrier_h_->dev_ctx(), response, - &cntl->response_attachment(), "", false); - } - } - - void PrefetchVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - prefetch_threads_->Run( - [=] { _PrefetchVariable(cntl_butil, request, response, done); }); - } - - void _PrefetchVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL(request_prefetch_h_, - platform::errors::PreconditionNotMet( - "kRequestPrefetch handler should be registed first!"); - - 
brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - // prefetch process... - std::string in_var_name = request->varname(); - std::string out_var_name = request->out_varname(); - VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name - << ", out_var_name: " << out_var_name - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - distributed::BRPCVariableResponse resp( - request_prefetch_h_->scope(), request_prefetch_h_->dev_ctx(), true); - - PADDLE_ENFORCE_EQ(resp.Parse(cntl->request_attachment(), *request), 0, - platform::errors::InvalidArgument( - "parse iobuf to tensor error!")); - - auto scope = resp.GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - std::string table_name = request->table_name(); - int trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = scope->Var(out_var_name); - - request_prefetch_h_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - - distributed::SerializeToIOBuf(out_var_name, outvar, - *request_prefetch_h_->dev_ctx(), response, - &cntl->response_attachment(), "", true); - } - - void CheckpointNotify(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - checkpoint_notify_threads_->Run( - [=] { _CheckpointNotify(cntl_butil, request, response, done); }); - } - - void _CheckpointNotify(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_checkpoint_h_, - platform::errors::PreconditionNotMet( - "kRequestCheckpointNotify handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - distributed::BRPCVariableResponse resp(request_checkpoint_h_->scope(), - request_checkpoint_h_->dev_ctx()); - - auto scope = resp.GetMutableLocalScope(); - - std::string checkpoint_notify = request->varname(); - std::string checkpoint_dir = request->out_varname(); - int trainer_id = request->trainer_id(); - - VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify - << ", dir: " << checkpoint_dir - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - request_checkpoint_h_->Handle(checkpoint_notify, scope, nullptr, nullptr, - trainer_id, checkpoint_dir); - } - - void GetMonomerVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - PADDLE_ENFORCE_NOT_NULL( - request_get_monomer_handler_h_, - platform::errors::PreconditionNotMet( - "kRequestGetMonomerVariable handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - // proc request. 
- std::string varname = request->varname(); - VLOG(3) << "GetMonomerVariable " << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - rpc_server_->WaitVarCond(varname); - distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); - - auto scope = h.scope_; - auto invar = scope->FindVar(varname); - paddle::framework::Variable* outvar = nullptr; - - request_get_monomer_handler_h_->Handle(varname, scope, invar, &outvar, - request->trainer_id()); - - if (outvar) { - distributed::SerializeToIOBuf(varname, outvar, *h.dev_ctx_, response, - &cntl->response_attachment(), "", false); - } - } - - void GetMonomerBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - PADDLE_ENFORCE_NOT_NULL( - request_get_monomer_barrier_handler_h_, - platform::errors::PreconditionNotMet( - "RequestGetMonomerBarrier handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - VLOG(3) << "RequestGetMonomerBarrier var_name:" << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - rpc_server_->WaitVarCond(varname); - distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); - - paddle::framework::Scope* scope = nullptr; - paddle::framework::Variable* invar = nullptr; - paddle::framework::Variable* outvar = nullptr; - - request_get_monomer_barrier_handler_h_->Handle( - varname, scope, invar, &outvar, request->trainer_id()); - } - - private: - distributed::RequestHandler* request_send_h_{nullptr}; - distributed::RequestHandler* request_get_h_{nullptr}; - distributed::RequestHandler* request_getnobarrier_h_{nullptr}; - distributed::RequestHandler* request_prefetch_h_{nullptr}; - distributed::RequestHandler* request_checkpoint_h_{nullptr}; - distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; - distributed::RequestHandler* request_get_monomer_barrier_handler_h_{nullptr}; - - distributed::RPCServer* rpc_server_{nullptr}; - - // FIXME(gongwb): brpc should support process one rpc use one threadpool. - std::unique_ptr send_threads_; - std::unique_ptr get_threads_; - std::unique_ptr getnobarrier_threads_; - std::unique_ptr prefetch_threads_; - std::unique_ptr checkpoint_notify_threads_; -}; -} // namespace sendrecv - -namespace paddle { -namespace operators { -namespace distributed { - -void AsyncBRPCServer::StartServer() { - // Instance of your service. - sendrecv::BRPCServiceImpl service_impl(rpc_call_map_, this); - - // Add the service into server. Notice the second parameter, because the - // service is put on stack, we don't want server to delete it, otherwise - // use brpc::SERVER_OWNS_SERVICE. 
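For context, the startup sequence used just below is the standard brpc one: add the stack-allocated service without transferring ownership, fill a brpc::ServerOptions, start on the bind address, and block in Join() until another thread stops the server. A trimmed sketch of that sequence (option values are illustrative; only calls that appear in the removed code are used):

#include <brpc/server.h>
#include <google/protobuf/service.h>

int RunServer(google::protobuf::Service* service, const char* bind_address) {
  brpc::Server server;
  // The service lives on the caller's stack, so the server must not delete it.
  if (server.AddService(service, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) {
    return -1;
  }
  brpc::ServerOptions options;
  options.idle_timeout_sec = -1;  // -1 as in the removed code: never drop idle connections
  if (server.Start(bind_address, &options) != 0) {
    return -1;
  }
  server.Join();  // returns after Stop() is called elsewhere
  return 0;
}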
- if (server_.AddService(&service_impl, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Failed to add service into BRPC server.")); - return; - } - - brpc::ServerOptions options; -#ifdef PADDLE_WITH_BRPC_RDMA - options.use_rdma = true; -#endif - options.idle_timeout_sec = idle_timeout_s_; - options.max_concurrency = max_concurrency_; - if (server_.Start(bind_address_.c_str(), &options) != 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Failed to start EchoServer %s.", bind_address_)); - return; - } - - butil::EndPoint ep = server_.listen_address(); - selected_port_ = ep.port; - - { - std::lock_guard lock(this->mutex_ready_); - ready_ = 1; - } - condition_ready_.notify_all(); - - server_.Join(); -} - -void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); } - -void AsyncBRPCServer::WaitServerReady() { - VLOG(3) << "AsyncGRPCServer is wait server ready"; - std::unique_lock lock(this->mutex_ready_); - condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(3) << "AsyncGRPCServer WaitSeverReady"; -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.h b/paddle/fluid/operators/distributed/brpc/brpc_server.h deleted file mode 100644 index 78bbe5adc0813..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // NOLINT -#include // NOLINT -#include - -#include "brpc/server.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class AsyncBRPCServer final : public RPCServer { - public: - explicit AsyncBRPCServer(const std::string& address, int client_num) - : RPCServer(address, client_num), ready_(0) {} - - virtual ~AsyncBRPCServer() {} - void StartServer() override; - void WaitServerReady() override; - - private: - void ShutDownImpl() override; - - brpc::Server server_; - - static constexpr int idle_timeout_s_ = -1; - static constexpr int max_concurrency_ = 0; - - std::mutex mutex_ready_; - std::condition_variable condition_ready_; - int ready_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc deleted file mode 100644 index 49521e8a77057..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - -namespace paddle { -namespace operators { -namespace distributed { - -namespace pb = ::google::protobuf; -using vr = ::sendrecv::VariableMessage; - -int BRPCVariableResponse::Parse(Source* source) { - pb::io::ZeroCopyInputStream* input_stream = source->contents(); - pb::io::CodedInputStream input(input_stream); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); - - while (1) { - unsigned int tag = 0; - if (!input.ReadLittleEndian32(&tag)) { - break; - } - - uint64_t num_bytes = 0; - if (!input.ReadLittleEndian64(&num_bytes)) { - break; - } - - int field = static_cast(tag); - int ret = field == 0 ? -1 : field; - switch (field) { - case vr::kSerializedFieldNumber: { - if (!ProcSerializedField(field, &input, num_bytes)) { - return ret; - } - break; - } - case vr::kRowsFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR) && - meta_.varname() != "", - platform::errors::PreconditionNotMet( - "meta info should be got first!")); - - if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { - return ret; - } - break; - } - default: { - PADDLE_THROW(platform::errors::Unavailable( - "not surpported %u fieldnumber", field)); - return ret; - } - } - } - - return 0; -} -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h deleted file mode 100644 index 6282f08a72536..0000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
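The Parse loop in the removed brpc_variable_response.cc walks the same [field][length][payload] records that IOBufWriter writes: a 4-byte little-endian field number, an 8-byte little-endian byte count, then a field-specific handler (serialized tensor bytes or selected-rows indices). A stripped-down version of that loop over an in-memory buffer, with the Paddle-specific handlers replaced by a skip (protobuf's CodedInputStream API):

#include <cstdint>
#include <string>

#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/io/zero_copy_stream_impl_lite.h>

// Returns 0 on success or the offending field number, mirroring BRPCVariableResponse::Parse.
int ParseRecords(const std::string& buf) {
  google::protobuf::io::ArrayInputStream array(buf.data(), static_cast<int>(buf.size()));
  google::protobuf::io::CodedInputStream input(&array);
  while (true) {
    uint32_t field = 0;
    if (!input.ReadLittleEndian32(&field)) break;  // clean end of stream
    uint64_t num_bytes = 0;
    if (!input.ReadLittleEndian64(&num_bytes)) break;
    // A real implementation dispatches on the field number here; this sketch just skips the payload.
    if (!input.Skip(static_cast<int>(num_bytes))) return static_cast<int>(field);
  }
  return 0;
}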
- -#pragma once - -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" - -#include "paddle/fluid/operators/distributed/distributed_pb.h" - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class BRPCSourceWrapper : public Source { - public: - explicit BRPCSourceWrapper(const butil::IOBuf& iobuf) : source_(iobuf) {} - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - return &source_; - } - - private: - butil::IOBufAsZeroCopyInputStream source_; -}; - -class BRPCVariableResponse : public VariableResponse { - public: - BRPCVariableResponse(const framework::Scope* scope, - const platform::DeviceContext* dev_ctx, - bool create_scope = false) - : VariableResponse(scope, dev_ctx, create_scope) {} - - virtual ~BRPCVariableResponse() {} - - // parse attachment from iobuf - int Parse(Source* source) override; - int Parse(const butil::IOBuf& iobuf, const sendrecv::VariableMessage& meta) { - BRPCSourceWrapper wrapper(iobuf); - return VariableResponse::Parse(&wrapper, meta); - } -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.cc b/paddle/fluid/operators/distributed/collective_client.cc deleted file mode 100644 index fcd3e6abead51..0000000000000 --- a/paddle/fluid/operators/distributed/collective_client.cc +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/distributed/collective_client.h" -#include -#include "gflags/gflags.h" - -DECLARE_int32(rpc_deadline); - -namespace paddle { -namespace operators { -namespace distributed { -std::once_flag CollectiveClient::init_flag_; -std::unique_ptr CollectiveClient::client_(nullptr); - -bool CollectiveClient::Gather(const std::vector& remote_vars, - std::vector* dst, - const platform::DeviceContext& ctx, - framework::Scope* scope, int64_t time_out) { - for (auto r : remote_vars) { - VLOG(50) << "begin gather from ep:" << r.String(); - scope->Var(r.var_name_)->GetMutable(); - VarHandlePtr ptr = rpc_client_->AsyncGetMonomerVariable( - r.ep_, ctx, *scope, r.var_name_, time_out); - } - - rpc_client_->Wait(); - - for (auto r : remote_vars) { - auto select_rows = - scope->FindVar(r.var_name_)->GetMutable(); - dst->push_back(select_rows); - - VLOG(4) << "gather from ep:" << r.String() - << ", select_rows:" << GetSelectedRowsInfo(*select_rows); - - rpc_client_->AsyncGetMonomerBarrier(r.ep_, r.var_name_); - } - - rpc_client_->Wait(); - return true; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h deleted file mode 100644 index e7d8bb8df9834..0000000000000 --- a/paddle/fluid/operators/distributed/collective_client.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include // NOLINT -#include -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -class SelectedRows; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -DECLARE_int32(rpc_deadline); - -namespace paddle { -namespace operators { -namespace distributed { - -inline std::string GetSelectedRowsInfo(const framework::SelectedRows& slr) { - std::stringstream ss; - ss << ", height:" << slr.height() << ", rows:["; - for (unsigned int i = 0; i < slr.rows().size(); i++) { - if (i != slr.rows().size() - 1) { - ss << slr.rows()[i] << ","; - } else { - ss << slr.rows()[i]; - } - } - ss << "], dims:" << slr.value().dims(); - return ss.str(); -} - -struct RemoteVar { - std::string ep_; - std::string var_name_; - int trainer_id_{0}; - - std::string String() { - std::stringstream ss; - ss << "ep:" << ep_ << ", var_name:" << var_name_ - << ", trainer_id:" << trainer_id_; - - return ss.str(); - } -}; - -class CollectiveClient { - public: - CollectiveClient() { - rpc_client_.reset(new RPCCLIENT_T()); - rpc_client_->InitImpl(); - } - virtual ~CollectiveClient() {} - - // note this function will retain the rank order. - bool Gather(const std::vector& remote_vars, - std::vector* dst, - const platform::DeviceContext& ctx, framework::Scope* scope, - int64_t time_out = FLAGS_rpc_deadline); - - static CollectiveClient* GetInstance() { - std::call_once(init_flag_, [&]() { - if (client_.get() == nullptr) { - client_.reset(new CollectiveClient()); - } - }); - return client_.get(); - } - - private: - std::unique_ptr rpc_client_; - - static std::once_flag init_flag_; - static std::unique_ptr client_; -}; -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.cc b/paddle/fluid/operators/distributed/collective_server.cc deleted file mode 100644 index cdd37742d2d5a..0000000000000 --- a/paddle/fluid/operators/distributed/collective_server.cc +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
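Both the collective client above and the collective server below expose themselves through the same lazily created singleton: a static std::once_flag plus a static smart pointer, initialized inside std::call_once so concurrent first calls are safe. A minimal stand-alone version of that pattern:

#include <memory>
#include <mutex>

class Singleton {
 public:
  static Singleton* GetInstance() {
    // call_once guarantees the body runs exactly once, even under concurrent callers.
    std::call_once(init_flag_, [] { instance_.reset(new Singleton()); });
    return instance_.get();
  }

 private:
  Singleton() = default;
  static std::once_flag init_flag_;
  static std::unique_ptr<Singleton> instance_;
};

std::once_flag Singleton::init_flag_;
std::unique_ptr<Singleton> Singleton::instance_;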
*/ - -#include "paddle/fluid/operators/distributed/collective_server.h" -#include - -DEFINE_int32(collective_get_thread_num, 5, "number of threads for rpc get"); - -namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag CollectiveServer::init_flag_; -std::shared_ptr CollectiveServer::collective_server_(nullptr); - -CollectiveServer::CollectiveServer(const std::string& end_point, int fan_in) { - VLOG(1) << "Create colllective server:" << end_point << ", fan_in:" << fan_in; - rpc_server_.reset(new RPCSERVER_T(end_point, fan_in)); -} - -void CollectiveServer::Stop() { - rpc_server_->ShutDown(); - server_thread_->join(); - loop_thread_->join(); -} - -void CollectiveServer::StartServer() { - get_monomer_handler_.reset(new GetMonomerHandler()); - get_monomer_handler_->SetRPCServer(rpc_server_.get()); - - get_barrier_handler_.reset(new GetMonomerBarrierHandler()); - get_barrier_handler_->SetRPCServer(rpc_server_.get()); - - rpc_server_->RegisterRPC(distributed::kRequestGetMonomerVariable, - get_monomer_handler_.get(), - FLAGS_collective_get_thread_num); - rpc_server_->RegisterRPC(distributed::kRequestGetMonomerBarrier, - get_barrier_handler_.get(), 1); - - server_thread_.reset(new std::thread([&]() { rpc_server_->StartServer(); })); - rpc_server_->WaitServerReady(); - - loop_thread_.reset(new std::thread([&]() { - while (true) { - if (rpc_server_->IsExit()) { - LOG(WARNING) << "get exit!rpc_processor break!"; - break; - } - sleep(1); - } - VLOG(1) << "CollectiveServer loop_thread end"; - })); -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h deleted file mode 100644 index 4964923286094..0000000000000 --- a/paddle/fluid/operators/distributed/collective_server.h +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include // NOLINT -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class CollectiveServer; - -class GetMonomerHandler final : public RequestHandler { - public: - GetMonomerHandler() : RequestHandler(true) {} - virtual ~GetMonomerHandler() {} - bool Handle(const std::string& var_name, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override { - VLOG(50) << "GetMonomerHandler recv " << var_name; - - *outvar = scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - outvar, platform::errors::NotFound("var: %s is not found.", var_name)); - - return true; - } -}; - -class GetMonomerBarrierHandler final : public RequestHandler { - public: - GetMonomerBarrierHandler() : RequestHandler(true) {} - virtual ~GetMonomerBarrierHandler() {} - bool Handle(const std::string& var_name, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override { - VLOG(50) << "GetMonomerHandler recv " << var_name; - - rpc_server_->IncreaseVarBarrier(var_name); - - return true; - } -}; - -class CollectiveServer final { - public: - explicit CollectiveServer(const std::string& end_point, int fan_in); - - virtual ~CollectiveServer() {} - - void StartServer(); - - static CollectiveServer* GetInstance(const std::string& end_point, - int fan_in) { - std::call_once(init_flag_, [&]() { - if (collective_server_.get() == nullptr) { - collective_server_.reset(new CollectiveServer(end_point, fan_in)); - collective_server_->StartServer(); - } - }); - - return collective_server_.get(); - } - - std::shared_ptr GetRPCServer() { return rpc_server_; } - - void Stop(); - - private: - std::unique_ptr get_monomer_handler_; - std::unique_ptr get_barrier_handler_; - - std::shared_ptr rpc_server_; - std::shared_ptr server_thread_; - std::shared_ptr loop_thread_; - - bool ready_{false}; - - static std::once_flag init_flag_; - static std::shared_ptr collective_server_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc deleted file mode 100644 index 92b2eb4b51e59..0000000000000 --- a/paddle/fluid/operators/distributed/collective_server_test.cc +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/operators/distributed/collective_client.h" -#include "paddle/fluid/operators/distributed/collective_server.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -} // namespace paddle - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace distributed = paddle::operators::distributed; - -std::unique_ptr StartServer( - const std::string& ep, int fan_in, framework::Scope* scope, - platform::DeviceContext* dev_ctx) { - distributed::CollectiveServer* server = - distributed::CollectiveServer::GetInstance(ep, fan_in); - - auto rpc_server = server->GetRPCServer(); - rpc_server->RegisterVar("var1", distributed::kRequestGetMonomerVariable, - scope, dev_ctx); - - std::cout << "StartServer return" << std::endl; - return std::unique_ptr(server); -} - -std::unique_ptr GenerateVars(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - framework::Scope* scope = new framework::Scope(); - framework::Variable* var = scope->Var("var1"); - auto* slr = var->GetMutable(); - slr->set_height(20000); - - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - - tensor->Resize(framework::make_ddim({3, 1024})); - tensor->mutable_data(place); - - paddle::operators::math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 3; ++i) rows->push_back(i); - - std::cout << "src:" << distributed::GetSelectedRowsInfo(*slr); - - return std::unique_ptr(scope); -} - -void Gather(const std::vector& vars, - platform::DeviceContext* dev_ctx) { - distributed::CollectiveClient* client = - distributed::CollectiveClient::GetInstance(); - - framework::Scope* scope = new framework::Scope(); - framework::Variable* var = scope->Var("var1"); - var->GetMutable(); - - std::vector dst; - client->Gather(vars, &dst, *dev_ctx, scope); - std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]); - dev_ctx->Wait(); - - ASSERT_EQ(dst[0]->value().dims(), framework::make_ddim({3, 1024})); - ASSERT_EQ(dst[0]->height(), 20000); - ASSERT_EQ(dst[0]->rows().size(), static_cast(3)); - for (int i = 0; i < 3; i++) { - ASSERT_EQ(dst[0]->rows()[i], i); - } - - std::vector vec; - TensorToVector(dst[0]->value(), *dev_ctx, &vec); - for (size_t i = 0; i < 3 * 1024; i++) { - ASSERT_FLOAT_EQ(vec[i], 32.7); - } -} - -TEST(CollectiveServer, GPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - - platform::CUDAPlace place; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - std::string ep = "127.0.0.1:7164"; - auto scope = GenerateVars(place); - - auto* v1 = scope->FindVar("var1"); - std::cout << "var1:" << v1 << std::endl; - - auto server = StartServer(ep, 2, scope.get(), &ctx); - auto rpc_server = server->GetRPCServer(); - - distributed::RemoteVar var; - var.ep_ = ep; - var.var_name_ = "var1"; - var.trainer_id_ = 0; - - std::vector vars{var}; - Gather(vars, &ctx); - Gather(vars, &ctx); - - std::cout << "begin WaitVarBarrier" << std::endl; - rpc_server->WaitVarBarrier("var1"); - rpc_server->ClearRegisteredVars(); - server->Stop(); - - scope.release(); - server.release(); -} diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc deleted file mode 100644 
index 4ee27a6414698..0000000000000 --- a/paddle/fluid/operators/distributed/communicator.cc +++ /dev/null @@ -1,989 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/distributed/communicator.h" - -#include - -#include -#include // NOLINT -#include -#include // NOLINT -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/parameter_recv.h" -#include "paddle/fluid/operators/distributed/parameter_send.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/split.h" - -namespace paddle { -namespace operators { -namespace distributed { - -using Tree = - std::map>>; -using RpcCtxMap = operators::distributed::RpcCtxMap; - -inline double GetCurrentUS() { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; -} - -Communicator::Communicator() {} - -std::once_flag Communicator::init_flag_; -std::shared_ptr Communicator::communicator_(nullptr); - -void AsyncCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) { - send_varname_to_ctx_ = std::move(send_varname_to_ctx); - recv_varname_to_ctx_ = std::move(recv_varname_to_ctx); - recv_scope_ = std::move(recv_scope); - - if (send_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be send, will not start send_thread"; - } else { - send_scope_.reset(new Scope()); - for (auto &iter : send_varname_to_ctx_) { - if (iter.first == STEP_COUNTER && !need_global_step_) continue; - send_varname_to_queue_[iter.first] = - std::make_shared>>( - send_queue_size_); - } - send_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - if (recv_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be received, will not start recv_thread"; - } else { - recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - InitParams(); -} - -void AsyncCommunicator::InitParams() { RecvNoBarrier(); } - -AsyncCommunicator::~AsyncCommunicator() { - running_ = false; - if (main_thread_) main_thread_->join(); -} - -void AsyncCommunicator::SendGlobalStep(int batches) { - if (!need_global_step_) { - return; - } - - if (batches == 0) { - return; - } - - auto &var_name = STEP_COUNTER; - auto *out_var = send_scope_->Var(var_name); - auto *out_t = out_var->GetMutable(); - auto *data = out_t->mutable_data({1}, platform::CPUPlace()); - data[0] = static_cast(batches); - - auto &ctx = send_varname_to_ctx_.at(var_name); - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); -} - -void AsyncCommunicator::SendByCommunicator() { - std::vector> task_futures; - 
task_futures.reserve(send_varname_to_ctx_.size()); - VLOG(3) << "run send graph"; - - auto before_run_send_graph = GetCurrentUS(); - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - auto send_task = [this, &var_name, &var_queue] { - VLOG(3) << var_name << " merge and send; "; - std::vector> vars; - - int merged_var_num = 0; - int wait_times = 0; - while (merged_var_num < max_merge_var_num_) { - if (var_queue->Size() == 0) { - VLOG(4) << "wait_times -> " << wait_times; - if (wait_times >= send_wait_times_) { - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - wait_times++; - continue; - } else { - wait_times = 0; - - vars.push_back(var_queue->Pop()); - merged_var_num++; - } - } - auto before_merge = GetCurrentUS(); - if (var_name == STEP_COUNTER) { - SendGlobalStep(merged_var_num); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge and send " << merged_var_num << " " << var_name - << " use time " << after_merge - before_merge; - return; - } - - auto &ctx = send_varname_to_ctx_.at(var_name); - - MergeVars(var_name, vars, send_scope_.get(), ctx.merge_add); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge " << merged_var_num << " " << var_name << " use time " - << after_merge - before_merge; - - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); - auto after_send = GetCurrentUS(); - VLOG(3) << "send " << var_name << " use time " - << after_send - after_merge; - - if (var_name.rfind("@GRAD") != var_name.size() - 5) return; - - auto recv_param = var_name.substr(0, var_name.size() - 5); - if (recv_varname_to_ctx_.find(recv_param) == recv_varname_to_ctx_.end()) - return; - - auto recv_functor = distributed::ParameterRecv(); - recv_functor(recv_varname_to_ctx_.at(recv_param), *recv_scope_); - auto after_recv = GetCurrentUS(); - VLOG(3) << "recv " << recv_param << " use time " - << after_recv - after_send; - }; - task_futures.emplace_back(send_threadpool_->enqueue(std::move(send_task))); - } - for (auto &task_f : task_futures) { - task_f.wait(); - } - auto after_run_send_graph = GetCurrentUS(); - - VLOG(3) << "run send graph use time " - << (after_run_send_graph - before_run_send_graph); -} - -void HalfAsyncCommunicator::SendByCommunicator() { - std::vector> task_futures; - task_futures.reserve(send_varname_to_ctx_.size()); - VLOG(3) << "run send graph"; - - int batches = BatchesCounter(); - if (batches <= 0) return; - - auto before_run_send_graph = GetCurrentUS(); - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - auto send_task = [this, batches, &var_name, &var_queue] { - VLOG(3) << var_name << " merge and send; "; - auto before_task = GetCurrentUS(); - std::vector> vars; - vars.reserve(batches); - - for (int i = 0; i < batches; ++i) { - vars.push_back(var_queue->Pop()); - } - - if (var_name == STEP_COUNTER) { - SendGlobalStep(batches); - auto end_task = GetCurrentUS(); - VLOG(3) << "merge " << batches << " " << var_name << " use time " - << end_task - before_task; - return; - } - - auto &ctx = send_varname_to_ctx_.at(var_name); - - auto before_merge = GetCurrentUS(); - MergeVars(var_name, vars, send_scope_.get(), ctx.merge_add); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge " << batches << " " << var_name << " use time " - << after_merge - before_merge; - - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); - auto after_send = 
GetCurrentUS(); - VLOG(3) << "send " << var_name << " use time " - << after_send - before_task; - - if (var_name.rfind("@GRAD") != var_name.size() - 5) return; - - auto recv_param = var_name.substr(0, var_name.size() - 5); - if (recv_varname_to_ctx_.find(recv_param) == recv_varname_to_ctx_.end()) - return; - - auto recv_functor = distributed::ParameterRecv(); - recv_functor(recv_varname_to_ctx_.at(recv_param), *recv_scope_); - auto after_recv = GetCurrentUS(); - VLOG(3) << "recv " << recv_param << " use time " - << after_recv - after_send; - return; - }; - task_futures.emplace_back(send_threadpool_->enqueue(std::move(send_task))); - } - for (auto &task_f : task_futures) { - task_f.wait(); - } - auto after_run_send_graph = GetCurrentUS(); - - VLOG(3) << "run send graph use time " - << (after_run_send_graph - before_run_send_graph); -} - -void AsyncCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - SendByCommunicator(); - BarrierSend(); - } - VLOG(3) << "communicator stopped, send thread exit"; -} - -void HalfAsyncCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - SendByCommunicator(); - BarrierSend(); - RecvByCommunicator(); - BarrierRecv(); - BarrierWeakUp(); - } - VLOG(3) << "communicator stopped, send thread exit"; -} - -void AsyncCommunicator::RecvByCommunicator() { - VLOG(3) << "parallel run recv graph"; - if (!running_) return; - RecvNoBarrier(); - VLOG(3) << "run recv graph use time"; -} - -void AsyncCommunicator::RecvNoBarrier() { - std::vector> task_futures; - task_futures.reserve(recv_varname_to_ctx_.size()); - - for (auto &iter : recv_varname_to_ctx_) { - auto recv_task = [this, &iter] { - auto before_task = GetCurrentUS(); - auto &var_name = iter.first; - auto recv_functor = distributed::ParameterRecv(); - recv_functor(iter.second, *recv_scope_); - auto end_task = GetCurrentUS(); - VLOG(1) << "recv var " << var_name << " use time " - << (end_task - before_task); - }; - task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task))); - } - - for (auto &task : task_futures) { - task.wait(); - } -} - -void AsyncCommunicator::Start() { - VLOG(3) << "Communicator start"; - if (!communicator_) { - VLOG(0) << "Communicator is not inited, do nothing"; - } else { - VLOG(3) << "start send thread and recv thread"; - waiting_ = true; - running_ = true; - BarrierTriggerReset(max_merge_var_num_); - // start send and recv thread - main_thread_.reset( - new std::thread(std::bind(&AsyncCommunicator::MainThread, this))); - } -} - -void AsyncCommunicator::Stop() { - VLOG(3) << "Communicator stop"; - running_ = false; - if (!communicator_) { - VLOG(0) << "Communicator is not inited, do nothing"; - } else { - if (main_thread_) { - VLOG(3) << "stop send thread"; - main_thread_->join(); - main_thread_.reset(nullptr); - } - } - VLOG(3) << "Communicator stop done"; -} - -void AsyncCommunicator::Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) { - waiting_ = false; - - PADDLE_ENFORCE_EQ( - var_tables.size(), 1, - platform::errors::InvalidArgument("var_tables.size() == 1 is permitted")); - - auto table_name = var_tables[0]; - - if (table_name == STEP_COUNTER && !need_global_step_) return; - 
- auto before_send_op = GetCurrentUS(); - auto &queue = send_varname_to_queue_.at(table_name); - - if (table_name == STEP_COUNTER) { - auto tmp_var = std::make_shared(); - auto *tensor = tmp_var->GetMutable(); - tensor->Resize(framework::make_ddim({1})); - auto *out_d = tensor->mutable_data(platform::CPUPlace()); - out_d[0] = 1; - queue->Push(tmp_var); - } else { - PADDLE_ENFORCE_GE(var_names.size(), 1, - platform::errors::InvalidArgument( - "var_names.size() >= 1 is permitted")); - - auto *var = scope.FindVar(var_names[0]); - - PADDLE_ENFORCE_EQ( - var->IsInitialized(), true, - platform::errors::InvalidArgument("grad var should be inited")); - - auto tmp_var = std::make_shared(); - if (var->IsType()) { - framework::CopyVariable(*var, tmp_var.get()); - queue->Push(tmp_var); - } else if (var->IsType()) { - // push var into send queue by var_name - auto var_name = var_names[0]; - framework::CopyVariable(*var, tmp_var.get()); - queue->Push(tmp_var); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "unknown var type to copy, only support LoDTensor/SelectedRows")); - } - } - auto after_send_op = GetCurrentUS(); - VLOG(3) << "send to " << table_name << " with queue size " << queue->Size() - << ", use time " << (after_send_op - before_send_op); -} - -void HalfAsyncCommunicator::Clean() { - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - while (var_queue->Size() > 0) { - var_queue->Pop(); - } - - VLOG(3) << "clean var: " << var_name << " done"; - } -} - -int HalfAsyncCommunicator::BatchesCounter() { - while (running_) { - if (barrier_counter_.load() >= barrier_trigger_.load() && - barrier_trigger_.load() != 0) { - break; - } else { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - } - - return barrier_counter_.load(); -} - -void HalfAsyncCommunicator::Barrier() { - barrier_counter_++; - - if (!running_) { - VLOG(3) << "Communicator is not running, release barrier"; - return; - } - - { - std::unique_lock lk(barrier_mutex_); - barrier_cond_.wait(lk, [this] { return (barrier_counter_ == 0); }); - } -} - -void HalfAsyncCommunicator::BarrierTriggerDecrement() { - barrier_trigger_--; - VLOG(3) << "BarrierTriggerDecrement decrement barrier trigger to " - << barrier_trigger_.load(); -} - -void HalfAsyncCommunicator::BarrierTriggerReset(int initial_val) { - barrier_trigger_.store(initial_val); - - VLOG(3) << "BarrierTriggerReset reset barrier trigger to " - << barrier_trigger_.load(); -} - -void HalfAsyncCommunicator::BarrierWeakUp() { - barrier_counter_.store(0); - barrier_cond_.notify_all(); -} - -void SyncCommunicator::BarrierSend() { - if (!running_) return; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id_); - - std::vector rets; - - for (auto &ep : pserver_endpoints_) { - rets.push_back(rpc_client->AsyncSendBatchBarrier(ep)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::External( - "internal error in RPCClient")); - } - - VLOG(4) << "BarrierSend with SyncCommunicator"; -} - -void SyncCommunicator::BarrierRecv() { - if (!running_) return; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id_); - - std::vector rets; - for (auto &ep : pserver_endpoints_) { - rets.push_back(rpc_client->AsyncSendFetchBarrier(ep)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::External( - "internal error in RPCClient")); - } 
- - VLOG(4) << "BarrierRecv with SyncCommunicator"; -} - -void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) { - send_varname_to_ctx_ = std::move(send_varname_to_ctx); - recv_varname_to_ctx_ = std::move(recv_varname_to_ctx); - recv_scope_ = std::move(recv_scope); - - PADDLE_ENFORCE_GT( - send_varname_to_ctx.size(), 0, - platform::errors::InvalidArgument("send var contexts can not be zero")); - - send_scope_.reset(new Scope()); - for (auto &iter : send_varname_to_ctx_) { - auto &varname = iter.first; - - if (varname == STEP_COUNTER) { - send_varname_to_queue_[varname] = - std::make_shared>>( - send_queue_size_); - } else { - auto &send_ctx = iter.second; - - send_var_nums_ += send_ctx.splited_varnames.size(); - if (!send_ctx.is_sparse) { - continue; - } - int pserver_num = static_cast(send_ctx.epmap.size()); - for (int ep_idx = 0; ep_idx < pserver_num; ep_idx++) { - sparse_id_queues_.insert( - std::pair>>>>( - send_ctx.splited_varnames[ep_idx], - std::make_shared< - BlockingQueue>>>( - send_queue_size_))); - } - } - } - send_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - - if (recv_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be received, will not start recv_thread"; - } else { - recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - delta_scope_.reset(new Scope()); - old_scope_.reset(new Scope()); - pserver_scope_.reset(new Scope()); - - InitParams(); -} - -void GeoCommunicator::Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) { - waiting_ = false; - PADDLE_ENFORCE_EQ( - var_tables.size(), 1, - platform::errors::InvalidArgument("var_tables.size() == 1 is permitted")); - - auto table_name = var_tables[0]; - if (table_name == STEP_COUNTER) return; - - auto before_send = GetCurrentUS(); - size_t splited_var_nums = - send_varname_to_ctx_[table_name].splited_varnames.size(); - - std::unordered_map> ids_table; - - for (size_t j = 0; j < splited_var_nums; j++) { - ids_table.insert(std::pair>( - send_varname_to_ctx_[table_name].splited_varnames[j], - std::unordered_set())); - } - auto *var = scope.FindVar(var_names[0]); - auto &rows = var->Get().rows(); - - // insert ids which has not been record - for (size_t j = 0; j < rows.size(); j++) { - auto ep_idx = rows[j] % splited_var_nums; - ids_table.at(send_varname_to_ctx_[table_name].splited_varnames[ep_idx]) - .insert(rows[j]); - } - - auto before_push = GetCurrentUS(); - for (auto &iter : ids_table) { - auto &key = iter.first; - auto &sparse_ids_set = iter.second; - auto sparse_ids_vec = std::make_shared>(); - sparse_ids_vec->assign(sparse_ids_set.begin(), sparse_ids_set.end()); - sparse_id_queues_.at(key)->Push(sparse_ids_vec); - VLOG(3) << "push " << sparse_ids_vec->size() << " ids to " << key - << "'s queue"; - } - auto after_send = GetCurrentUS(); - VLOG(3) << "run send " << table_name << " op finish. 
using " - << (before_push - before_send) << "; " << (after_send - before_push); -} - -void GeoCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - std::vector> tasks; - tasks.reserve(send_var_nums_); - - for (auto &iter : send_varname_to_ctx_) { - auto &var_name = iter.first; - auto &send_ctx = iter.second; - int pserver_num = static_cast(send_ctx.epmap.size()); - if (send_ctx.is_sparse) { - for (int ep_idx = 0; ep_idx < pserver_num; ep_idx++) { - auto send_recv_task = [this, ep_idx, &var_name] { - auto before_send_sparse = GetCurrentUS(); - if (var_name == STEP_COUNTER) { - return; - } - auto send_varname = - send_varname_to_ctx_.at(var_name).splited_varnames[ep_idx]; - auto sparse_ids = MergeSparseIds(send_varname); - if (sparse_ids.size() == 0) { - return; - } - SendSparse(var_name, ep_idx, sparse_ids); - auto after_send_sparse = GetCurrentUS(); - RecvSparse(var_name, ep_idx); - auto after_recv_sparse = GetCurrentUS(); - VLOG(3) - << "send recv " - << send_varname_to_ctx_.at(var_name).splited_varnames[ep_idx] - << " finish, using " << (after_send_sparse - before_send_sparse) - << " and " << (after_recv_sparse - after_send_sparse) - << "; total = " << (after_recv_sparse - before_send_sparse); - }; - tasks.emplace_back( - send_threadpool_->enqueue(std::move(send_recv_task))); - } - } else { - auto send_recv_task = [this, &var_name, &send_ctx] { - if (var_name == STEP_COUNTER) { - return; - } - SendDense(var_name); - RecvDense(var_name); - }; - tasks.emplace_back( - send_threadpool_->enqueue(std::move(send_recv_task))); - } - } - for (auto &task : tasks) { - task.wait(); - } - } -} - -std::vector GeoCommunicator::MergeSparseIds( - const std::string &send_varname) { - size_t merge_num = 0, wait_times = 0; - std::unordered_set sparse_ids; - while (merge_num < static_cast(max_merge_var_num_)) { - VLOG(3) << "Merge Number of " << send_varname << " = " << merge_num; - if (sparse_id_queues_.at(send_varname)->Size() > 0) { - wait_times = 0; - std::shared_ptr> pop_ids = - sparse_id_queues_.at(send_varname)->Pop(); - for (size_t j = 0; j < pop_ids->size(); j++) { - sparse_ids.insert(pop_ids->at(j)); - } - merge_num += 1; - VLOG(3) << "sparse_id_queues_(" << send_varname << ") pushed"; - } else if (sparse_id_queues_.at(send_varname)->Size() == 0) { - VLOG(3) << "wait_times -> " << wait_times; - if (wait_times >= static_cast(send_wait_times_)) { - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - wait_times++; - continue; - } - } - std::vector res; - res.assign(sparse_ids.begin(), sparse_ids.end()); - return res; -} -void GeoCommunicator::SendSparse(const std::string &varname, int ep_idx, - const std::vector &sparse_ids) { - auto &rpc_ctx = send_varname_to_ctx_.at(varname); - auto send_varname = rpc_ctx.splited_varnames[ep_idx]; - auto trainer_id = rpc_ctx.trainer_id; - auto endpoint = rpc_ctx.epmap[ep_idx]; - auto pserver_num = rpc_ctx.epmap.size(); - - auto *var_latest = recv_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ(var_latest->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - auto &t_latest = var_latest->Get(); - - auto dims1 = t_latest.dims()[1]; - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(send_varname); - auto *t_delta = var_delta->GetMutable(); - - auto *t_value = 
t_delta->mutable_value(); - t_value->mutable_data( - framework::make_ddim({static_cast(sparse_ids.size()), dims1}), - cpu_ctx.GetPlace()); - - std::vector *>> values; - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Get(sparse_ids, {"Param"}, &values); - - auto blas = math::GetBlas(cpu_ctx); - float coefficient = 1.0 / static_cast(trainers_); - - for (auto j = 0; j < static_cast(sparse_ids.size()); ++j) { - blas.VSUB(dims1, t_latest.data() + sparse_ids[j] * dims1, - values[j][0]->data(), t_value->data() + j * dims1); - blas.SCAL(dims1, coefficient, t_value->data() + j * dims1); - blas.VADD(dims1, values[j][0]->data(), t_value->data() + j * dims1, - values[j][0]->data()); - } - - std::vector send_rows; - send_rows.reserve(sparse_ids.size()); - for (auto idx : sparse_ids) { - send_rows.push_back(idx / pserver_num); - } - t_delta->set_height(rpc_ctx.height_sections[ep_idx]); - t_delta->set_rows(send_rows); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx_send = *pool.Get(platform::CPUPlace()); - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id); - - auto ret = rpc_client->AsyncSendVar(endpoint, cpu_ctx_send, - *delta_scope_.get(), send_varname); - ret->Wait(); -} - -void GeoCommunicator::SendDense(const std::string &varname) { - auto *var_latest = recv_scope_->FindVar(varname); - auto *var_timestamp = old_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ(var_latest->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - PADDLE_ENFORCE_EQ(var_timestamp->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - - auto &t_latest = var_latest->Get(); - auto t_timestamp = var_timestamp->GetMutable(); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(varname); - auto *t_delta = var_delta->GetMutable(); - t_delta->mutable_data(t_latest.dims(), cpu_ctx.GetPlace()); - - auto blas = math::GetBlas(cpu_ctx); - blas.VSUB(t_latest.numel(), t_latest.data(), - t_timestamp->data(), t_delta->data()); - - float coefficient = 1.0 / static_cast(trainers_); - blas.SCAL(t_latest.numel(), coefficient, t_delta->data()); - - blas.VADD(t_latest.numel(), t_timestamp->data(), - t_delta->data(), t_timestamp->data()); - - auto &ctx = send_varname_to_ctx_.at(varname); - auto send = distributed::ParameterSend(); - send(ctx, *delta_scope_, true, 1); -} - -void GeoCommunicator::RecvByCommunicator() { return; } - -void GeoCommunicator::RecvSparse(const std::string &varname, int ep_idx) { - auto train_id = recv_varname_to_ctx_.at(varname).trainer_id; - auto endpoint = recv_varname_to_ctx_.at(varname).epmap[ep_idx]; - auto splited_var_name = - recv_varname_to_ctx_.at(varname).splited_varnames[ep_idx]; - auto pserver_num = recv_varname_to_ctx_.at(varname).epmap.size(); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx_recv = *pool.Get(platform::CPUPlace()); - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(train_id); - - auto *var_psrever = pserver_scope_->Var(splited_var_name); - auto handle = rpc_client->AsyncGetVar(endpoint, cpu_ctx_recv, - *pserver_scope_.get(), splited_var_name, - splited_var_name, splited_var_name); - handle->Wait(); - - auto *var_latest = recv_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ( - var_psrever->IsInitialized(), true, - platform::errors::Unavailable( - "%s in 
pserver scope is not initialized, please check", varname)); - - std::vector ids; - ids.assign(var_psrever->Get().rows().begin(), - var_psrever->Get().rows().end()); - - for (size_t j = 0; j < ids.size(); j++) { - ids[j] = ids[j] * pserver_num + ep_idx; - } - - VLOG(3) << "RecvSparse receive var: " << splited_var_name - << " ids Size: " << ids.size(); - - auto t_psrever = var_psrever->Get().value(); - - std::vector *>> old_values; - - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Get(ids, {"Param"}, &old_values); - - auto *t_latest = var_latest->GetMutable(); - - auto dims1 = t_latest->dims()[1]; - auto numel = ids.size() * dims1; - - std::vector v_delta; - v_delta.resize(numel); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto blas = math::GetBlas(cpu_ctx); - - for (auto j = 0; j < static_cast(ids.size()); ++j) { - blas.VSUB(dims1, t_psrever.data() + j * dims1, - old_values[j][0]->data(), v_delta.data() + j * dims1); - blas.VADD(dims1, t_latest->data() + ids[j] * dims1, - v_delta.data() + j * dims1, - t_latest->data() + ids[j] * dims1); - blas.VCOPY(dims1, t_psrever.data() + j * dims1, - old_values[j][0]->data()); - } -} - -void GeoCommunicator::RecvDense(const std::string &varname) { - auto *var_latest = recv_scope_->FindVar(varname); - auto *var_timestamp = old_scope_->FindVar(varname); - auto *var_psrever = pserver_scope_->Var(varname); - - auto &ctx = recv_varname_to_ctx_.at(varname); - auto recv = distributed::ParameterRecv(); - recv(ctx, *pserver_scope_); - - PADDLE_ENFORCE_EQ( - var_psrever->IsInitialized(), true, - platform::errors::Unavailable( - "%s in pserver scope is not initialized, please check", varname)); - - auto t_psrever = var_psrever->Get(); - auto t_latest = var_latest->GetMutable(); - auto t_timestamp = var_timestamp->GetMutable(); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(varname); - auto *t_delta = var_delta->GetMutable(); - t_delta->mutable_data(t_latest->dims(), cpu_ctx.GetPlace()); - - auto blas = math::GetBlas(cpu_ctx); - blas.VSUB(t_latest->numel(), t_psrever.data(), - t_timestamp->data(), t_delta->data()); - blas.VADD(t_latest->numel(), t_latest->data(), t_delta->data(), - t_latest->data()); - blas.VCOPY(t_latest->numel(), t_psrever.data(), - t_timestamp->data()); -} - -void GeoCommunicator::InitParams() { - std::vector> tasks; - tasks.reserve(recv_varname_to_ctx_.size()); - - for (auto &iter : recv_varname_to_ctx_) { - auto &var_name = iter.first; - auto &recv_ctx = iter.second; - - auto recv_task = [this, &var_name, &recv_ctx] { - if (!recv_ctx.is_sparse) { - InitDense(var_name); - } - }; - tasks.emplace_back(send_threadpool_->enqueue(std::move(recv_task))); - } - - for (auto &task : tasks) { - task.wait(); - } - InitSparse(); -} - -void GeoCommunicator::InitDense(const std::string varname) { - auto &ctx = recv_varname_to_ctx_.at(varname); - auto recv = distributed::ParameterRecv(); - recv(ctx, *recv_scope_); - - auto *global_var = recv_scope_->FindVar(varname); - global_var->GetMutable(); - - auto *old_var = old_scope_->Var(varname); - old_var->GetMutable(); - - framework::CopyVariable(*global_var, old_var); - VLOG(1) << "init dense variable " << varname << " done"; -} - -void GeoCommunicator::InitSparse() { - auto sparse_metas = string::split_string(sparse_attrs_, "#"); - - std::vector metas; - std::vector dicts; - - for (auto &sparse_meta : sparse_metas) { - auto attrs = string::split_string(sparse_meta, ":"); - - auto meta = distributed::SparseMeta(); - 
meta.name = attrs[0]; - meta.value_names = {"Param"}; - - auto dic = string::split_string(attrs[1], ","); - dicts.push_back(std::stoi(dic[0])); - meta.value_dims = {std::stoi(dic[1])}; - meta.mode = distributed::Mode::training; - meta.grad_name = "none"; - meta.cached_varnames = {}; - meta.initializer_attrs = string::split_string(attrs[2]); - meta.entry = "none"; - - VLOG(3) << "add sparse meta: " << meta.ToString(); - metas.push_back(meta); - } - - LargeScaleKV::Init(metas); - - for (auto &meta : metas) { - auto &ctx = recv_varname_to_ctx_.at(meta.name); - auto recv = distributed::ParameterRecv(); - - auto *global_var = recv_scope_->FindVar(meta.name); - auto global_value = global_var->Get(); - auto rows = global_value.dims()[0]; - auto dim1 = global_value.dims()[1]; - - recv(ctx, *recv_scope_); - VLOG(1) << "recv " << meta.name << " with global scope for init"; - - auto n_rows = global_var->Get().dims()[0]; - - PADDLE_ENFORCE_EQ( - rows, n_rows, - platform::errors::InvalidArgument( - "global var: %s origin dim must equal recved rows", meta.name)); - - std::vector ids(rows); - std::iota(ids.begin(), ids.end(), 0); - - auto *ins = distributed::LargeScaleKV::GetInstance(); - std::vector *>> values; - - ins->Get(meta.name)->Init(ids); - ins->Get(meta.name)->Get(ids, {"Param"}, &values); - - auto blas = math::GetBlas( - paddle::platform::CPUDeviceContext()); - - for (auto &id : ids) { - blas.VCOPY(dim1, global_value.data() + id * dim1, - values[id][0]->data()); - } - } - - VLOG(3) << "init sparse variable done"; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h deleted file mode 100644 index 4be3253d3923f..0000000000000 --- a/paddle/fluid/operators/distributed/communicator.h +++ /dev/null @@ -1,490 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" - -DECLARE_bool(communicator_is_sgd_optimizer); - -namespace paddle { -namespace operators { -namespace distributed { - -using Scope = framework::Scope; -using Variable = framework::Variable; - -template -class BlockingQueue { - public: - explicit BlockingQueue(size_t capacity) : capacity_(capacity) { - PADDLE_ENFORCE_GT(capacity_, 0, - platform::errors::InvalidArgument( - "The capacity must be greater than 0.")); - } - - bool Push(const T &elem) { - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return queue_.size() < capacity_; }); - PADDLE_ENFORCE_LT( - queue_.size(), capacity_, - platform::errors::OutOfRange("The queue size: %s out of capacity:%s", - queue_.size(), capacity_)); - queue_.push_back(elem); - } - cv_.notify_one(); - return true; - } - - bool Push(T &&elem) { - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return queue_.size() < capacity_; }); - PADDLE_ENFORCE_LT( - queue_.size(), capacity_, - platform::errors::OutOfRange("The queue size: %s out of capacity:%s", - queue_.size(), capacity_)); - queue_.emplace_back(std::move(elem)); - } - cv_.notify_one(); - return true; - } - - T Pop() { - std::unique_lock lock(mutex_); - cv_.wait(lock, [=] { return !queue_.empty(); }); - T rc(std::move(queue_.front())); - queue_.pop_front(); - cv_.notify_one(); - return rc; - } - - size_t Cap() const { - std::lock_guard lock(mutex_); - return capacity_; - } - - size_t Size() const { - std::lock_guard lock(mutex_); - return queue_.size(); - } - - private: - const size_t capacity_; - std::deque queue_; - - mutable std::mutex mutex_; - std::condition_variable cv_; -}; - -template -using EigenVector = framework::EigenVector; - -template -inline void MergeVars(const std::string &var_name, - const std::vector> &vars, - Scope *scope, bool merge_add = true) { - PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument( - "vector vars are empty.")); - auto cpu_place = platform::CPUPlace(); - auto &var0 = vars[0]; - auto *out_var = scope->Var(var_name); - if (var0->IsType()) { - auto dims = var0->Get().dims(); - VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims - << "; merge add: " << merge_add; - // init output tensor - auto *out_t = out_var->GetMutable(); - out_t->mutable_data(dims, cpu_place); - // check the input dims - for (auto &var : vars) { - auto &var_t = var->Get(); - PADDLE_ENFORCE_EQ( - var_t.dims(), dims, - platform::errors::InvalidArgument("vars should have the same dims")); - } - - // set output tensor to 0. 
- auto cpu_ctx = paddle::platform::CPUDeviceContext(); - math::SetConstant constant_functor; - constant_functor(cpu_ctx, out_t, static_cast(0)); - // sum all vars to out - auto result = EigenVector::Flatten(*out_t); - for (auto &var : vars) { - auto &in_t = var->Get(); - auto in = EigenVector::Flatten(in_t); - result.device(*cpu_ctx.eigen_device()) = result + in; - } - if (!merge_add) { - result.device(*cpu_ctx.eigen_device()) = - result / static_cast(vars.size()); - } - } else if (var0->IsType()) { - auto &slr0 = var0->Get(); - auto *out_slr = out_var->GetMutable(); - out_slr->mutable_rows()->clear(); - out_slr->mutable_value()->mutable_data({{}}, cpu_place); - std::vector inputs; - inputs.reserve(vars.size()); - for (auto &var : vars) { - inputs.push_back(&var->Get()); - } - auto dev_ctx = paddle::platform::CPUDeviceContext(); - if (merge_add) { - math::scatter::MergeAdd merge_add; - merge_add(dev_ctx, inputs, out_slr); - } else { - math::scatter::MergeAverage - merge_average; - merge_average(dev_ctx, inputs, out_slr); - } - - VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height() - << " dims: " << slr0.value().dims() << "; merge add: " << merge_add; - } else { - PADDLE_THROW(platform::errors::InvalidArgument("unsupported var type: %s!", - var0->Type())); - } -} - -using RpcCtxMap = std::unordered_map; -using SparseValue = std::unordered_map>; - -class Communicator { - public: - Communicator(); - - explicit Communicator(const std::map &envs_) { - for (auto &iter : envs_) { - envs[iter.first] = iter.second; - } - } - - virtual ~Communicator() {} - - virtual void Start() = 0; - - virtual void Stop() = 0; - - virtual bool IsRunning() { return running_; } - - virtual void Clean() {} - - virtual void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) = 0; - - virtual void RecvNoBarrier() {} - - virtual void Barrier() {} - - virtual void BarrierTriggerDecrement() {} - - virtual void BarrierTriggerReset(int init_counter) {} - - virtual void InitEnvs() = 0; - - virtual void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) {} - - static Communicator *GetInstance() { return communicator_.get(); } - - static std::shared_ptr GetInstantcePtr() { - return communicator_; - } - - template - static Communicator *InitInstance( - const RpcCtxMap &send_ctx, const RpcCtxMap &recv_ctx, Scope *recv_scope, - const std::map &envs) { - std::call_once(init_flag_, &Communicator::InitWithRpcCtx, send_ctx, - recv_ctx, recv_scope, std::ref(envs)); - return communicator_.get(); - } - - // Init is called by InitInstance. 
- template - static void InitWithRpcCtx(const RpcCtxMap &send_ctx, - const RpcCtxMap &recv_ctx, Scope *recv_scope, - const std::map &envs) { - if (communicator_.get() == nullptr) { - communicator_.reset(new T(std::ref(envs))); - communicator_->InitEnvs(); - communicator_->InitImpl(send_ctx, recv_ctx, recv_scope); - } - } - - protected: - bool running_ = false; - bool waiting_ = true; - static std::shared_ptr communicator_; - static std::once_flag init_flag_; - std::unordered_map envs; -}; - -class AsyncCommunicator : public Communicator { - public: - AsyncCommunicator() : Communicator() {} - - explicit AsyncCommunicator(const std::map &envs) - : Communicator(envs) {} - - ~AsyncCommunicator(); - - void InitEnvs() { - min_send_grad_num_before_recv_ = - std::stoi(envs.at("communicator_min_send_grad_num_before_recv")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - VLOG(0) << "AsyncCommunicator Initialized"; - } - - void Start() override; - - void Stop() override; - - void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) override; - - void InitParams(); - - virtual void MainThread(); - - void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) override; - - virtual void SendByCommunicator(); - virtual void SendGlobalStep(int batches); - - virtual void RecvByCommunicator(); - - virtual void RecvNoBarrier(); - - virtual void BarrierSend() {} - - virtual void BarrierRecv() {} - - virtual void BarrierWeakUp() {} - - protected: - int min_send_grad_num_before_recv_; - int thread_pool_size_; - int max_merge_var_num_; - int send_wait_times_; - int send_queue_size_; - int trainer_id_ = 0; - bool need_global_step_ = false; - - std::unordered_map>>> - send_varname_to_queue_; - RpcCtxMap send_varname_to_ctx_; - RpcCtxMap recv_varname_to_ctx_; - std::unique_ptr main_thread_{nullptr}; - Scope *recv_scope_; // should be global scope - std::unique_ptr send_scope_; // an independent scope - std::unique_ptr<::ThreadPool> send_threadpool_{nullptr}; - std::unique_ptr<::ThreadPool> recv_threadpool_{nullptr}; - std::atomic_uint grad_num_{0}; // the num of gradient sent since last recv -}; - -class HalfAsyncCommunicator : public AsyncCommunicator { - public: - HalfAsyncCommunicator() {} - - explicit HalfAsyncCommunicator(const std::map &envs) - : AsyncCommunicator(envs) {} - - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - VLOG(0) << "HalfAsyncCommunicator Initialized"; - } - - void MainThread() override; - - void SendByCommunicator() override; - - void Clean() override; - - void Barrier() override; - - void BarrierTriggerDecrement() override; - - void BarrierTriggerReset(int initial_val) override; - - int BatchesCounter(); - - void BarrierWeakUp(); - - protected: - // mutex for Wait for barrier - 
std::mutex barrier_mutex_; - std::condition_variable barrier_cond_; - std::atomic barrier_trigger_{0}; - std::atomic barrier_counter_{0}; -}; - -class SyncCommunicator : public HalfAsyncCommunicator { - public: - SyncCommunicator() : HalfAsyncCommunicator() {} - - explicit SyncCommunicator(const std::map &envs) - : HalfAsyncCommunicator(envs) {} - - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - - trainer_id_ = std::stoi(envs.at("trainer_id")); - auto pserver_strings = envs.at("pserver_endpoints"); - pserver_endpoints_ = paddle::string::Split(pserver_strings, ','); - VLOG(0) << "SyncCommunicator Initialized"; - } - - void BarrierSend(); - - void BarrierRecv(); - - private: - std::vector pserver_endpoints_{}; -}; - -class GeoCommunicator : public AsyncCommunicator { - public: - GeoCommunicator() : AsyncCommunicator() {} - - explicit GeoCommunicator(const std::map &envs) - : AsyncCommunicator(envs) {} - - void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) override; - void MainThread() override; - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - - send_queue_size_ = max_merge_var_num_; - trainers_ = std::stoi(envs.at("trainers")); - sparse_attrs_ = envs.at("sparse_attrs"); - VLOG(0) << "GeoCommunicator Initialized"; - } - - void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) override; - - void SendByCommunicator() { return; } - - std::vector MergeSparseIds(const std::string &send_varname); - - void SendSparse(const std::string &varname, int ep_idx, - const std::vector &sparse_ids); - - void SendDense(const std::string &varname); - - void SendGlobalStep(int batches) override {} - - void RecvByCommunicator() override; - - void RecvSparse(const std::string &varname, int ep_idx); - - void RecvDense(const std::string &varname); - - void InitParams(); - - void InitSparse(); - - void InitDense(const std::string varname); - - private: - int trainers_; - std::string sparse_attrs_; - - // parameter for delta calc and send - std::shared_ptr delta_scope_; - - // parameter for storage the pserver param after last recv - std::shared_ptr old_scope_; - - // parameter on pserver - std::shared_ptr pserver_scope_; - - int send_var_nums_ = 0; - - std::unordered_map> old_sparses_; - - std::unordered_map< - std::string, - std::shared_ptr>>>> - sparse_id_queues_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator_common.h b/paddle/fluid/operators/distributed/communicator_common.h deleted file mode 100644 index 122d904eba27a..0000000000000 --- a/paddle/fluid/operators/distributed/communicator_common.h +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <iostream>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-struct CommContext {
-  CommContext() = default;
-
-  CommContext(const std::string &name, const std::vector<std::string> &names,
-              const std::vector<std::string> &emap,
-              const std::vector<int64_t> &sections,
-              const std::vector<std::string> &origin_names, int id,
-              bool merge_add_ = true, bool is_sparse_ = true,
-              bool is_distributed_ = false)
-      : var_name(name),
-        splited_varnames(names),
-        epmap(emap),
-        height_sections(sections),
-        origin_varnames(origin_names),
-        trainer_id(id),
-        merge_add(merge_add_),
-        is_sparse(is_sparse_),
-        is_distributed(is_distributed_) {}
-
-  CommContext(const CommContext &ctx) {
-    var_name = ctx.var_name;
-    splited_varnames = ctx.splited_varnames;
-    epmap = ctx.epmap;
-    height_sections = ctx.height_sections;
-    trainer_id = ctx.trainer_id;
-    merge_add = ctx.merge_add;
-    is_sparse = ctx.is_sparse;
-    origin_varnames = ctx.origin_varnames;
-    is_distributed = ctx.is_distributed;
-  }
-
-  std::string print() const {
-    std::stringstream ss;
-
-    ss << "varname: " << var_name << " trainer_id: " << trainer_id << " ";
-
-    for (size_t i = 0; i < splited_varnames.size(); i++) {
-      ss << "slice varname: " << splited_varnames[i] << " ep: " << epmap[i]
-         << " section: " << height_sections[i] << " ";
-    }
-
-    ss << "origin varnames: ";
-    for (size_t i = 0; i < origin_varnames.size(); i++) {
-      ss << origin_varnames[i] << " ";
-    }
-
-    ss << " aggregation->add: " << merge_add << " ";
-    ss << " is_sparse: " << is_sparse << "\n";
-    ss << " is_distributed: " << is_distributed << "\n";
-
-    return ss.str();
-  }
-
-  std::string var_name;
-  std::vector<std::string> splited_varnames;
-  std::vector<std::string> epmap;
-  std::vector<int64_t> height_sections;
-  std::vector<std::string> origin_varnames;
-  int trainer_id;
-  bool merge_add;
-  bool is_sparse;
-  bool is_distributed;
-};
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/communicator_test.cc b/paddle/fluid/operators/distributed/communicator_test.cc
deleted file mode 100644
index 38b7c8b00317e..0000000000000
--- a/paddle/fluid/operators/distributed/communicator_test.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gtest/gtest.h>
-
-#include "paddle/fluid/operators/distributed/communicator.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-using LoDTensor = framework::LoDTensor;
-using SelectedRows = framework::SelectedRows;
-
-TEST(communicator, merge_lod_tensors) {
-  auto cpu_place = platform::CPUPlace();
-  auto dims = framework::make_ddim({2, 3});
-  std::vector<std::shared_ptr<Variable>> in_vars;
-  float out_value = 0;
-  for (auto i = 0; i < 10; ++i) {
-    auto var = std::make_shared<Variable>();
-    in_vars.emplace_back(var);
-    auto *tensor = var->GetMutable<LoDTensor>();
-    auto *data = tensor->mutable_data<float>(dims, cpu_place);
-    for (auto j = 0; j < tensor->numel(); ++j) {
-      data[j] = static_cast<float>(i);
-    }
-    out_value += static_cast<float>(i);
-  }
-  const std::string out_name = "Out";
-  std::unique_ptr<framework::Scope> scope;
-  scope.reset(new framework::Scope());
-  scope->Var(out_name);
-  for (auto i = 0; i < 10; ++i) {
-    MergeVars<float>(out_name, in_vars, scope.get());
-  }
-  auto &out_tensor = scope->FindVar(out_name)->Get<LoDTensor>();
-  auto *out_data = out_tensor.data<float>();
-  ASSERT_EQ(out_tensor.dims(), dims);
-  for (auto i = 0; i < out_tensor.numel(); ++i) {
-    ASSERT_EQ(out_data[i], out_value);
-  }
-}
-
-TEST(communicator, merge_selected_rows) {
-  auto cpu_place = platform::CPUPlace();
-  int64_t width = 10;
-  std::vector<std::shared_ptr<Variable>> in_vars;
-  const int64_t height = 100;
-  for (auto i = 0; i < 10; ++i) {
-    std::vector<int64_t> rows;
-    for (auto k = 0; k <= i; ++k) {
-      rows.push_back(k);
-    }
-    auto var = std::make_shared<Variable>();
-    in_vars.emplace_back(var);
-    auto *slr = var->GetMutable<SelectedRows>();
-    slr->set_height(height);
-    slr->set_rows(rows);
-    auto dims =
-        framework::make_ddim({static_cast<int64_t>(rows.size()), width});
-    auto *data = slr->mutable_value()->mutable_data<float>(dims, cpu_place);
-    for (size_t i = 0; i < rows.size(); ++i) {
-      for (auto j = 0; j < width; ++j) {
-        data[i * width + j] = static_cast<float>(rows[i]);
-      }
-    }
-  }
-  const std::string out_name = "Out";
-  std::unique_ptr<framework::Scope> scope;
-  scope.reset(new framework::Scope());
-  scope->Var(out_name);
-  for (auto i = 0; i < 10; ++i) {
-    MergeVars<float>(out_name, in_vars, scope.get());
-  }
-  auto &out_slr = scope->FindVar(out_name)->Get<SelectedRows>();
-  auto &out_t = out_slr.value();
-  auto *out_data = out_t.data<float>();
-  ASSERT_EQ(out_t.dims(), framework::make_ddim({10, width}));
-  std::vector<float> out_values;
-  out_values.reserve(10);
-  for (auto i = 0; i < 10; ++i) {
-    out_values.push_back(static_cast<float>(i * (10 - i)));
-  }
-  for (size_t i = 0; i < out_slr.rows().size(); ++i) {
-    ASSERT_EQ(out_slr.rows()[i], static_cast<int64_t>(i));
-    for (auto j = 0; j < width; ++j) {
-      ASSERT_EQ(out_data[i * width + j], out_values[i]);
-    }
-  }
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/distributed.h b/paddle/fluid/operators/distributed/distributed.h
deleted file mode 100644
index 5917c18fb0d20..0000000000000
--- a/paddle/fluid/operators/distributed/distributed.h
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-
-#ifdef PADDLE_WITH_GRPC
-#include "paddle/fluid/operators/distributed/communicator.h"
-
-#include "paddle/fluid/operators/distributed/grpc/grpc_client.h"
-#include "paddle/fluid/operators/distributed/grpc/grpc_server.h"
-#define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer
-#define RPCCLIENT_T paddle::operators::distributed::GRPCClient
-
-#else  // PADDLE_WITH_GRPC
-
-#include "paddle/fluid/operators/distributed/brpc/brpc_client.h"
-#include "paddle/fluid/operators/distributed/brpc/brpc_server.h"
-#define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer
-#define RPCCLIENT_T paddle::operators::distributed::BRPCClient
-
-#endif  // PADDLE_WITH_GRPC
-
-#endif  // PADDLE_WITH_DISTRIBUTE
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc
deleted file mode 100644
index 7d6756b41363d..0000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// NOTE: This file was originally created by tensorflow
-// (https://github.com/tensorflow/tensorflow/) we borrow this
-// file and did some modifications so that we can send gRPC
-// requests without too much copying of the tensor data.
-
-#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
-
-namespace grpc {
-class ByteBuffer;
-}  // namespace grpc
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-GrpcByteBufferSource::GrpcByteBufferSource() {}
-
-bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) {
-  cur_ = -1;
-  left_ = 0;
-  ptr_ = nullptr;
-  byte_count_ = 0;
-  bool ok = src.Dump(&slices_).ok();
-  if (!ok) {
-    slices_.clear();
-  }
-  return ok;
-}
-
-bool GrpcByteBufferSource::Next(const void** data, int* size) {
-  // Use loop instead of if in case buffer contained empty slices.
-  while (left_ == 0) {
-    // Advance to next slice.
-    cur_++;
-    if (cur_ >= slices_.size()) {
-      return false;
-    }
-    const ::grpc::Slice& s = slices_[cur_];
-    left_ = s.size();
-    ptr_ = reinterpret_cast<const char*>(s.begin());
-  }
-
-  *data = ptr_;
-  *size = left_;
-  byte_count_ += left_;
-  ptr_ += left_;
-  left_ = 0;
-  return true;
-}
-
-void GrpcByteBufferSource::BackUp(int count) {
-  ptr_ -= count;
-  left_ += count;
-  byte_count_ -= count;
-}
-
-bool GrpcByteBufferSource::Skip(int count) {
-  const void* data;
-  int size;
-  while (Next(&data, &size)) {
-    if (size >= count) {
-      BackUp(size - count);
-      return true;
-    }
-    // size < count;
-    count -= size;
-  }
-  // error or we have too large count;
-  return false;
-}
-
-google::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
-  return byte_count_;
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h
deleted file mode 100644
index 486870de7a554..0000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// NOTE: This file was originally created by tensorflow
-// (https://github.com/tensorflow/tensorflow/) we borrow this
-// file and did some modifications so that we can send gRPC
-// requests without too much copying of the tensor data.
- -#pragma once - -#include - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "grpc++/grpc++.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -struct grpc_byte_buffer; - -namespace grpc { -// A ZeroCopyInputStream that reads from grpc_byte_buffer -class ByteBuffer; - -class GrpcBufferReader final - : public ::google::protobuf::io::ZeroCopyInputStream { - typedef void (CoreCodegenInterface::*OldReaderInitAPI)( - grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); - typedef int (CoreCodegenInterface::*NewReaderInitAPI)( - grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); - void ReaderInit(OldReaderInitAPI ptr, grpc_byte_buffer_reader* reader, - grpc_byte_buffer* buffer) { - (g_core_codegen_interface->*ptr)(reader, buffer); - } - void ReaderInit(NewReaderInitAPI ptr, grpc_byte_buffer_reader* reader, - grpc_byte_buffer* buffer) { - int result = (g_core_codegen_interface->*ptr)(reader, buffer); - (void)result; - } - - public: - explicit GrpcBufferReader(grpc_byte_buffer* buffer) - : byte_count_(0), backup_count_(0) { - ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_, - buffer); - } - ~GrpcBufferReader() override { - g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_); - } - - bool Next(const void** data, int* size) override { - if (backup_count_ > 0) { - *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) - - backup_count_; - GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX); - *size = static_cast(backup_count_); - backup_count_ = 0; - return true; - } - if (!g_core_codegen_interface->grpc_byte_buffer_reader_next(&reader_, - &slice_)) { - return false; - } - g_core_codegen_interface->grpc_slice_unref(slice_); - *data = GRPC_SLICE_START_PTR(slice_); - // On win x64, int is only 32bit - GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX); - byte_count_ += * size = static_cast(GRPC_SLICE_LENGTH(slice_)); - return true; - } - - void BackUp(int count) override { backup_count_ = count; } - - bool Skip(int count) override { - const void* data; - int size; - while (Next(&data, &size)) { - if (size >= count) { - BackUp(size - count); - return true; - } - // size < count; - count -= size; - } - // error or we have too large count; - return false; - } - - ::google::protobuf::int64 ByteCount() const override { - return byte_count_ - backup_count_; - } - - private: - int64_t byte_count_; - int64_t backup_count_; - grpc_byte_buffer_reader reader_; - grpc_slice slice_; -}; - -}; // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -// A ZeroCopyInputStream that reads from a grpc::ByteBuffer. -class GrpcByteBufferSource - : public ::google::protobuf::io::ZeroCopyInputStream { - public: - GrpcByteBufferSource(); - bool Init(const ::grpc::ByteBuffer& src); // Can be called multiple times. - bool Next(const void** data, int* size) override; - void BackUp(int count) override; - bool Skip(int count) override; - ::google::protobuf::int64 ByteCount() const override; - - private: - std::vector<::grpc::Slice> slices_; - size_t cur_; // Current slice index. - int left_; // Number of bytes in slices_[cur_] left to yield. - const char* ptr_; // Address of next byte in slices_[cur_] to yield. 
- ::google::protobuf::int64 byte_count_; -}; - -class GrpcByteBufferSourceWrapper : public Source { - public: - explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) - : source_(source) {} - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - return source_; - } - - private: - GrpcByteBufferSource* source_; -}; - -class GrpcByteSource : public Source { - public: - explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {} - ~GrpcByteSource() override { DeleteStream(); } - - typedef ::grpc::GrpcBufferReader Reader; - - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - DeleteStream(); - stream_ = new (&space_) Reader(buffer_); - return stream_; - } - - private: - void DeleteStream() { - if (stream_) { - stream_->~Reader(); - } - } - - grpc_byte_buffer* buffer_; // Not owned - Reader* stream_ = nullptr; // Points into space_ if non-nullptr - char space_[sizeof(Reader)]; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc deleted file mode 100644 index 97a9c14e4f185..0000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc +++ /dev/null @@ -1,671 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "glog/logging.h" // For VLOG -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_client.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/port.h" -#include "paddle/fluid/platform/profiler.h" - -DEFINE_int32(rpc_client_threads, 2, ""); -DECLARE_bool(rpc_disable_reuse_port); - -namespace paddle { -namespace operators { -namespace distributed { - -void GRPCClient::InitImpl() { - // start the client process thread - // TODO(wuyi): can make this in a threadpool - client_threads_.resize(FLAGS_rpc_client_threads); - for (int i = 0; i < FLAGS_rpc_client_threads; i++) { - client_threads_[i].reset( - new std::thread(std::bind(&GRPCClient::Proceed, this))); - } -} - -void GRPCClient::SendComplete() { - std::unique_lock lk(completed_mutex_); - if (!completed_) { - for (auto& it : channels_) { - VLOG(3) << "send complete message to " << it.first; - this->AsyncSendComplete(it.first); - } - PADDLE_ENFORCE_EQ(this->Wait(), true, platform::errors::PreconditionNotMet( - "internal grpc service error.")); - completed_ = true; - } -} - -GRPCClient::~GRPCClient() { - stopped_ = true; - Wait(); - cq_.Shutdown(); - { - std::lock_guard guard(chan_mutex_); - for (auto& it : channels_) { - it.second.reset(); - } - channels_.clear(); - } - for (size_t i = 0; i < client_threads_.size(); i++) - client_threads_[i]->join(); -} - -VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kSendRPC; - - int retry_times_ = 0; - - while (true) { - SendProcessor* s = new SendProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, p_scope, p_ctx, s, method, h, this] { - auto* var = p_scope->FindVar(var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = nullptr; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -void ProcGetResponse(const VarHandle& var_h, - const ::grpc::ByteBuffer& ret_msg) { - VLOG(4) << "ProcGetResponse"; - framework::Variable* outvar = nullptr; - // get response's trainer_id is not used - int trainer_id; - DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar, - &trainer_id); -} - -void ProcGetRecvResponse(const VarHandle& var_h, - const ::grpc::ByteBuffer& 
ret_msg) { - VLOG(4) << "ProcGetRecvResponse"; - framework::Variable* outvar = nullptr; - int trainer_id; - DeserializeRecvFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar, - &trainer_id); -} - -template -void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { - ::grpc::Slice slice(proto.ByteSizeLong()); - proto.SerializeWithCachedSizesToArray(const_cast(slice.begin())); - ::grpc::ByteBuffer tmp(&slice, 1); - result->Swap(&tmp); -} - -VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname, - "/sendrecv.SendRecvService/GetVariable", table_name, - time_out); -} - -VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, int64_t time_out) { - std::string var_name_no_barrier = - string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); - - return _AsyncGetVar( - ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname, - "/sendrecv.SendRecvService/GetVariableNoBarrier", "", time_out); -} - -VarHandlePtr GRPCClient::AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name, - "/sendrecv.SendRecvService/GetMonomerVariable", "", - time_out); -} - -VarHandlePtr GRPCClient::_AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& method, - const std::string& var_name, const std::string& out_varname, - const std::string& rpc_path, const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const std::string out_varname_val = out_varname; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - int retry_times_ = 0; - - while (true) { - GetProcessor* s = new GetProcessor(ch); - - VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, out_varname_val, table_name_val, s, method, - p_ctx, h, rpc_path, this] { - // prepare input - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_out_varname(out_varname_val); - req.set_trainer_id(trainer_id_); - req.set_table_name(table_name_val); - ::grpc::ByteBuffer buf; - RequestToByteBuffer(req, &buf); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = - s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - 
std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string in_var_name_val = in_var_name; - const std::string out_var_name_val = out_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - const std::string method = kPrefetchRPC; - int retry_times_ = 0; - - while (true) { - GetProcessor* s = new GetProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); - s->Prepare(h, kPrefetchTimeout); - - auto* var = p_scope->FindVar(in_var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val, - 0, table_name_val); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, static_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = kBatchBarrierRPC; - VarHandlePtr h( - new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(BATCH_BARRIER_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - const std::string method = kFetchBarrierRPC; - VarHandlePtr h( - new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(FETCH_BARRIER_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, - const std::string& var_name, - int64_t time_out) { - const auto ch = GetChannel(ep); - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - 
const std::string method = kSendMonomerFetchBarrierRPC; - VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr)); - s->Prepare(h, time_out); - - VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; - - sendrecv::VariableMessage req; - req.set_varname(var_name); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = kSendCompleteRPC; - VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_trainer_id(trainer_id_); - req.set_varname(COMPLETE_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, - const std::string& dirname, - const std::string& varname, - const int mode, - int64_t time_out) { - const auto ch = GetChannel(ep); - - CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); - - const std::string method = kCheckPointNotifyRPC; - - VarHandlePtr h( - new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(varname); - req.set_table_name(std::to_string(mode)); - req.set_out_varname(dirname); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kRequestNotify; - - SendProcessor* s = new SendProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, p_scope, p_ctx, s, method, h, this] { - auto* var = p_scope->FindVar(var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = nullptr; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/DistributeNotify", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - }); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendAndRecv(const std::string& ep, - 
const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& send_var_name, - const std::string& recv_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string send_var_name_val = send_var_name; - const std::string recv_var_name_val = recv_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kSendAndRecvRPC; - VLOG(4) << "GRPCClient::SendAndRecv Begin ,Send_var_name: " - << send_var_name_val << " Recv_var_name: " << recv_var_name_val; - int retry_times_ = 0; - - while (true) { - SendAndRecvProcessor* s = new SendAndRecvProcessor(ch); - VarHandlePtr h( - new VarHandle(ep, method, send_var_name_val, p_ctx, p_scope)); - VarHandlePtr h_recv( - new VarHandle(ep, method, recv_var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - s->RecvPrepare(h_recv); - - framework::Async([send_var_name_val, recv_var_name_val, table_name_val, - p_scope, p_ctx, s, method, h, this] { - auto* send_var = p_scope->FindVar(send_var_name_val); - send_var->GetMutable()->set_lod({}); - ::grpc::ByteBuffer buf; - VLOG(4) << "SerializeToByteBuffer: send_var_name_val: " - << send_var_name_val - << " recv_var_name_val: " << recv_var_name_val; - SerializeToByteBuffer(send_var_name_val, send_var, *p_ctx, &buf, - recv_var_name_val, trainer_id_, table_name_val); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetRecvResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/SendAndRecvVariable", - buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -bool GRPCClient::Wait() { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); }); - return ok_; -} - -inline bool ShouldRetry(const std::string& method, int error_code) { - if (method == kPrefetchRPC) { - return true; - } - - if (error_code == grpc::StatusCode::DEADLINE_EXCEEDED) { - return true; - } - - return false; -} - -void GRPCClient::Proceed() { - void* tag = nullptr; - bool ok = false; - - VLOG(3) << "GRPCClient Proceed begin"; - while (!stopped_ && cq_.Next(&tag, &ok)) { - BaseProcessor* c = static_cast(tag); - GPR_ASSERT(ok); - PADDLE_ENFORCE_NOT_NULL( - c, platform::errors::PreconditionNotMet("Make BaseProcessor failed.")); - - if (c->status_.ok()) { - VLOG(3) << c->GetVarHandlePtr()->String() << " process"; - c->Process(); - } else if (ShouldRetry(c->GetVarHandlePtr()->method(), - c->status_.error_code())) { - VLOG(0) << c->GetVarHandlePtr()->String() - << " meets grpc error, error_code:" << c->status_.error_code() - << " error_message:" << c->status_.error_message() - << " error_details:" << c->status_.error_details() - << " should retry!"; - c->GetVarHandlePtr()->should_retry = true; - c->Finish(false); - } else { - 
      PADDLE_THROW(platform::errors::External(
          "%s meets grpc error, error_code is %d, error message is %s, error "
          "details is %s.",
          c->GetVarHandlePtr()->String(), c->status_.error_code(),
          c->status_.error_message(), c->status_.error_details()));
      c->Finish(false);
    }

    bool notify = false;
    {
      std::lock_guard<std::mutex> lk(sync_mutex_);
      req_count_--;
      notify = (req_count_ <= 0 || !c->status_.ok());
    }

    delete c;

    if (notify) {
      sync_cond_.notify_all();
    }
  }

  // Last log message
  // Avoid using VLOG() and LOG(): in the destructor of google::LogMessage() a
  // static Mutex log_mutex is used for synchronization, which might have been
  // destructed at this moment.
  if (FLAGS_v >= 3) {
    std::string msg("GRPCClient Proceed end");
    fwrite(msg.c_str(), msg.length(), 1, stderr);
  }
}

std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
  std::lock_guard<std::mutex> guard(chan_mutex_);
  auto it = channels_.find(ep);
  if (it != channels_.end()) {
    return it->second;
  }

  // Channel configurations:
  grpc::ChannelArguments args;
  args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000);
  if (FLAGS_rpc_disable_reuse_port) {
    args.SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0);
  }
  args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE);
  args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
  args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());

  auto ch =
      grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args);
  channels_[ep] = ch;
  return ch;
}

}  // namespace distributed
}  // namespace operators
}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h
deleted file mode 100644
index 5885f944b60a1..0000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.h
+++ /dev/null
@@ -1,321 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#pragma once - -#include -#include -#include // NOLINT -#include // NOLINT -#include -#include -#include -#include -#include -#include // NOLINT -#include -#include // NOLINT -#include -#include - -#include "grpc++/channel.h" -#include "grpc++/generic/generic_stub.h" -#include "grpc++/grpc++.h" -#include "grpc++/support/byte_buffer.h" -#include "grpc++/support/slice.h" -#include "grpc/support/log.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN - -namespace grpc { -class Channel; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); - -void ProcGetRecvResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); - -class BaseProcessor { - public: - BaseProcessor() { context_ = nullptr; } - - virtual ~BaseProcessor() {} - - virtual void Prepare(VarHandlePtr h, int64_t time_out) { - var_h_ = h; - - context_.reset(new grpc::ClientContext()); - context_->set_wait_for_ready(true); - if (time_out) { - std::chrono::system_clock::time_point deadline = - std::chrono::system_clock::now() + - std::chrono::milliseconds(time_out); - context_->set_deadline(deadline); - } - } - - void Process() { - ProcessImpl(); - var_h_->Finish(true); - } - - VarHandlePtr GetVarHandlePtr() { return var_h_; } - bool Wait() { return var_h_->Wait(); } - void Finish(bool ok) { return var_h_->Finish(ok); } - virtual void ProcessImpl() = 0; - - std::unique_ptr context_; - grpc::Status status_; - - protected: - VarHandlePtr var_h_; -}; - -typedef std::function - RequestSendCallBack; - -class SendProcessor : public BaseProcessor { - public: - explicit SendProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~SendProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_.get(), reply_); - } - } - - ::grpc::GenericStub stub_g_; - ::grpc::ByteBuffer reply_; - RequestSendCallBack response_call_back_ = nullptr; -}; - -typedef std::function - RequestGetCallBack; - -class GetProcessor : public BaseProcessor { - public: - explicit GetProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~GetProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_.get(), reply_); - } - } - - ::grpc::ByteBuffer reply_; - ::grpc::GenericStub stub_g_; - RequestGetCallBack response_call_back_ = ProcGetResponse; -}; - -class SendAndRecvProcessor : public BaseProcessor { - public: - explicit SendAndRecvProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~SendAndRecvProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_recv_.get(), reply_); - var_h_recv_->Finish(true); - } - } - - void RecvPrepare(VarHandlePtr h_recv) { 
var_h_recv_ = h_recv; } - - ::grpc::ByteBuffer reply_; - ::grpc::GenericStub stub_g_; - RequestGetCallBack response_call_back_ = ProcGetResponse; - VarHandlePtr var_h_recv_; -}; - -class BatchBarrierProcessor : public BaseProcessor { - public: - explicit BatchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~BatchBarrierProcessor() {} - - void ProcessImpl() override {} - sendrecv::VoidMessage reply_; - std::unique_ptr stub_; -}; - -class FetchBarrierProcessor : public BaseProcessor { - public: - explicit FetchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~FetchBarrierProcessor() {} - - void ProcessImpl() override {} - sendrecv::VariableMessage reply_; - std::unique_ptr stub_; -}; - -class CheckpointNotifyProcessor : public BaseProcessor { - public: - explicit CheckpointNotifyProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~CheckpointNotifyProcessor() {} - - void ProcessImpl() override {} - sendrecv::VoidMessage reply_; - std::unique_ptr stub_; -}; - -class GRPCClient : public RPCClient { - public: - GRPCClient() : ok_(true), completed_(false), stopped_(false) {} - virtual ~GRPCClient(); - - VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) override; - - VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendAndRecv(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& send_var_name, - const std::string& recv_var_name, - const std::string& table_name = "", 
- int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendComplete( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - bool Wait() override; - - void SendComplete() override; - - void InitImpl() override; - - private: - void Proceed(); - - std::shared_ptr GetChannel(const std::string& ep); - VarHandlePtr _AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& method, - const std::string& var_name, const std::string& out_varname, - const std::string& rpc_path, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline); - - private: - grpc::CompletionQueue cq_; - std::unordered_map> channels_; - std::vector> client_threads_; - - // mutex for Wait client sync - std::mutex sync_mutex_; - std::condition_variable sync_cond_; - std::atomic req_count_{0}; - bool ok_; - - // mutex for GetChannel thread safety - std::mutex chan_mutex_; - DISABLE_COPY_AND_ASSIGN(GRPCClient); - - // mutex for sending complete message only once - std::mutex completed_mutex_; - bool completed_; - - volatile bool stopped_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc deleted file mode 100644 index 0fc9b69577914..0000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc +++ /dev/null @@ -1,190 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_NCCL -#include -#endif -#ifdef PADDLE_WITH_RCCL -#include -#endif -#include -#include -#include "grpcpp/impl/codegen/byte_buffer.h" -#include "grpcpp/impl/codegen/slice.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/proto_encoder_helper.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -void SerializeToByteBuffer(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg, const std::string& out_name, - const int trainer_id, - const std::string& table_name) { - platform::RecordRPCEvent record_event("serial"); - VarMsg request; - TensorPayload* payload = nullptr; - - request.set_varname(name); - request.set_trainer_id(trainer_id); - // Note: normally the profiler is enabled in 1 trainer, hence only - // 1 trainer returns true for ShouldSendProfileState(). 
It tells PS - // servers the trainer's profiling state so that PS can follow the - // trainer. - if (platform::ShouldSendProfileState()) { - if (platform::IsProfileEnabled()) { - request.set_profile(platform::kEnableProfiler); - } else { - request.set_profile(platform::kDisableProfiler); - } - } - if (!out_name.empty()) { - request.set_out_varname(out_name); - } - if (!table_name.empty()) { - request.set_table_name(table_name); - } - if (var->IsType()) { - request.set_type(::sendrecv::LOD_TENSOR); - payload = new TensorPayload(GetTensorPayload(var, ctx, &request)); - } else if (var->IsType()) { - request.set_type(::sendrecv::SELECTED_ROWS); - payload = new TensorPayload(GetSelectedRowsPayload(var, ctx, &request)); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - } else if (var->IsType()) { - request.set_type(::sendrecv::NCCL_ID); -#endif - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Serialize does not support type: %s", typeid(var->Type()).name())); - } - std::string header; - request.AppendToString(&header); - auto buffer = std::unique_ptr(new char[1024]); - void* buf = buffer.get(); - ProtoEncodeHelper e(static_cast(buf), 1024); - e.WriteRawBytes(std::string(header.data(), header.size())); -// NCCLID is copied directly to the message, return bytebuffer -// with only one slice if serializing NCCLID. -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (var->IsType()) { - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, - NCCL_UNIQUE_ID_BYTES); - const ncclUniqueId& uid = var->Get(); - e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES)); - - // for serialize NCCL_ID - ::grpc::Slice slices(e.size()); - memcpy(const_cast(slices.begin()), e.data(), e.size()); - ::grpc::ByteBuffer tmp(&slices, 1); - msg->Swap(&tmp); - return; - } -#endif - PADDLE_ENFORCE_NOT_NULL( - payload, - platform::errors::InvalidArgument( - "Not support type: %s, need to be LOD_TENSOR or SELECTED_ROWS", - var->Type())); - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, - payload->memory_size()); - if (payload->memory_size() >= std::numeric_limits::max()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Variable %s length %d should less than %d.", name, - payload->memory_size(), std::numeric_limits::max())); - } - // steal reference of tensor data - ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows - int num_slices = 2; // only SelectedRows have rows buffer - slices[0] = ::grpc::Slice(e.size()); - memcpy(const_cast(slices[0].begin()), e.data(), e.size()); - slices[1] = ::grpc::Slice( - grpc_slice_new_with_user_data(payload->ptr(), payload->memory_size(), - SerializeDestroyCallback, payload), - ::grpc::Slice::STEAL_REF); - - if (var->IsType()) { - auto* slr = var->GetMutable(); - ProtoEncodeHelper e2(static_cast(buf), 128); - - PADDLE_ENFORCE_EQ(VectorElemName(slr->rows()), typeid(int64_t).name(), - platform::errors::InvalidArgument( - "Got wrong type %s, expect type: int64_t", - VectorElemName(slr->rows()))); - size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); - - e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); - slices[2] = ::grpc::Slice(e2.size()); - memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); - - slices[3] = ::grpc::Slice( - grpc_slice_new_with_user_data( - const_cast( - reinterpret_cast(slr->rows().data())), - rows_memory_size, [](void* backing) {}, - const_cast( - reinterpret_cast(slr->rows().data()))), - ::grpc::Slice::STEAL_REF); - num_slices = 4; - } - 
-  ::grpc::ByteBuffer tmp(&slices[0], num_slices);
-  msg->Swap(&tmp);
-}
-
-void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
-                               const platform::DeviceContext& ctx,
-                               const framework::Scope* scope,
-                               framework::Variable** var, int* trainer_id) {
-  platform::RecordRPCEvent record_event("deserial");
-  operators::distributed::GRPCVariableResponse resp(scope, &ctx);
-  PADDLE_ENFORCE_EQ(
-      resp.Parse(msg), 0,
-      platform::errors::InvalidArgument("parse bytebuffer to tensor error!"));
-  *var = resp.GetVar();
-  *trainer_id = resp.GetTrainerId();
-}
-
-void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg,
-                                   const platform::DeviceContext& ctx,
-                                   const framework::Scope* scope,
-                                   framework::Variable** var, int* trainer_id) {
-  platform::RecordRPCEvent record_event("deserial");
-  operators::distributed::GRPCVariableResponse resp(scope, &ctx);
-  PADDLE_ENFORCE_EQ(
-      resp.Parse(msg), 0,
-      platform::errors::InvalidArgument("parse bytebuffer to tensor error!"));
-  *var = resp.GetRecvVar();
-  *trainer_id = resp.GetTrainerId();
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.h b/paddle/fluid/operators/distributed/grpc/grpc_serde.h
deleted file mode 100644
index 932f3e2f069a2..0000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include
-#include
-#include
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/distributed/distributed_pb.h"
-#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace grpc {
-class ByteBuffer;
-}  // namespace grpc
-namespace paddle {
-namespace framework {
-class Scope;
-class Variable;
-}  // namespace framework
-namespace platform {
-class DeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-typedef void (*DestroyCallback)(void*);
-
-void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
-                           const platform::DeviceContext& ctx,
-                           ::grpc::ByteBuffer* msg,
-                           const std::string& out_varname = std::string(),
-                           const int trainer_id = 0,
-                           const std::string& table_name = std::string());
-
-void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
-                               const platform::DeviceContext& ctx,
-                               const framework::Scope* scope,
-                               framework::Variable** var, int* trainer_id);
-
-void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg,
-                                   const platform::DeviceContext& ctx,
-                                   const framework::Scope* scope,
-                                   framework::Variable** var, int* trainer_id);
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc
deleted file mode 100644
index d407a72938a74..0000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc
+++ /dev/null
@@ -1,224 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include -#include -#include // NOLINT - -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/printf.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace operators = paddle::operators; -namespace math = paddle::operators::math; -namespace memory = paddle::memory; - -void RunSerdeTestSelectedRows(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // serialize var to ByteBuffer - framework::Variable var; - auto* slr = var.GetMutable(); - slr->set_height(1000); - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({564, 128})); - tensor->mutable_data(place); - int tensor_numel = 564 * 128; - math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 564; ++i) rows->push_back(i); - - ::grpc::ByteBuffer msg; - operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg); - EXPECT_GT(msg.Length(), static_cast(0)); - - // deserialize - std::vector<::grpc::Slice> slices; - (void)msg.Dump(&slices); - std::string tmp; - for (const auto& s : slices) { - tmp.append(reinterpret_cast(s.begin()), s.size()); - } - - sendrecv::VariableMessage varmsg; - EXPECT_TRUE(varmsg.ParseFromString(tmp)); - - // deserialize bytebuffer - EXPECT_EQ(varmsg.varname(), "myvar"); - EXPECT_EQ(varmsg.type(), 1); - - const float* tensor_data = - reinterpret_cast(varmsg.serialized().data()); - const int64_t* rows_data = - reinterpret_cast(varmsg.rows().data()); - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data[i], 32.7); - } - for (int i = 0; i < 564; ++i) { - EXPECT_EQ(rows_data[i], i); - } - - // deserialize zero-copy - // framework::Variable var2; - // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2); - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::GRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto* slr2 = var2->GetMutable(); - auto* tensor2 = slr2->mutable_value(); - auto* rows2 = slr2->mutable_rows(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(*tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2->data()); - } - const int64_t* rows_data2 = rows2->data(); - - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); - } - for (size_t i = 0; i < rows2->size(); ++i) { - EXPECT_EQ(rows_data2[i], static_cast(i)); - } - EXPECT_EQ(slr2->height(), 1000); -} - -void RunTestLodTensor(platform::Place place, int from_type = 0) { - // serialize var to ByteBuffer - framework::Variable var; - auto* tensor = var.GetMutable(); - tensor->Resize(framework::make_ddim({512, 8, 4, 2})); - framework::LoD lod; - lod.push_back(framework::Vector({1, 
3, 8})); - tensor->set_lod(lod); - int tensor_numel = 512 * 8 * 4 * 2; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 31.9); - - ::grpc::ByteBuffer msg; - operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg, - "outvar", 0, "table_name"); - EXPECT_GT(msg.Length(), static_cast(0)); - - // deserialize - std::vector<::grpc::Slice> slices; - (void)msg.Dump(&slices); - std::string tmp; - for (const auto& s : slices) { - tmp.append(reinterpret_cast(s.begin()), s.size()); - } - sendrecv::VariableMessage varmsg; - EXPECT_TRUE(varmsg.ParseFromString(tmp)); - EXPECT_EQ(varmsg.varname(), "myvar"); - EXPECT_EQ(varmsg.type(), 0); - EXPECT_EQ(varmsg.dims()[0], 512); - EXPECT_EQ(varmsg.dims()[1], 8); - EXPECT_EQ(varmsg.dims()[2], 4); - EXPECT_EQ(varmsg.dims()[3], 2); - EXPECT_EQ(varmsg.lod_level(), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); - EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); - - const float* tensor_data = - reinterpret_cast(varmsg.serialized().data()); - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data[i], 31.9); - } - - // message binary - std::string str; - varmsg.SerializeToString(&str); - - // message bytebuffer - ::grpc::Slice slices_2[1]; - int num_slices = 1; - slices_2[0] = ::grpc::Slice(str.length()); - memcpy(const_cast(slices_2[0].begin()), str.c_str(), str.length()); - ::grpc::ByteBuffer bytebuffer2(&slices_2[0], num_slices); - - // deserialize zero-copy - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::GRPCVariableResponse resp(&scope, &ctx); - if (from_type == 0) { - EXPECT_EQ(resp.Parse(msg), 0); - } else { - EXPECT_EQ(resp.Parse(bytebuffer2), 0); - } - - framework::Variable* var2 = resp.GetVar(); - - auto tensor2 = var2->Get(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2.data()); - } - - EXPECT_EQ(varmsg.lod_level(), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); - EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); - for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); -} - -TEST(LodTensor, Run) { - platform::CPUPlace place; - RunTestLodTensor(place); - RunTestLodTensor(place, 1); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu(0); - RunTestLodTensor(gpu); - RunTestLodTensor(gpu, 1); -#endif -} - -TEST(SelectedRows, Run) { - platform::CPUPlace place; - RunSerdeTestSelectedRows(place); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu; - RunSerdeTestSelectedRows(gpu); -#endif -} diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc deleted file mode 100644 index 912520d782d75..0000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ /dev/null @@ -1,720 +0,0 @@ -/*Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include - -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_server.h" - -namespace grpc { -class ChannelArguments; -} // namespace grpc -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -namespace operators { -namespace distributed { -class GRPCVariableResponse; -} // namespace distributed -} // namespace operators -} // namespace paddle - -using ::grpc::ServerAsyncResponseWriter; - -DECLARE_bool(rpc_disable_reuse_port); -DECLARE_int32(rpc_retry_bind_port); - -namespace paddle { -namespace operators { -namespace distributed { - -enum CallStatus { PROCESS = 0, FINISH }; - -// reference: -// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server -class RequestBase { - public: - explicit RequestBase(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : service_(service), - cq_(cq), - status_(PROCESS), - request_handler_(request_handler), - req_id_(req_id) { - PADDLE_ENFORCE_NOT_NULL(cq_, platform::errors::InvalidArgument( - "ServerCompletionQueue cq are empty")); - } - virtual ~RequestBase() {} - virtual void Process() = 0; - - std::string Status2String(const std::string& method) { - std::string status = "Process"; - if (status_ == FINISH) { - status = "Finish"; - } - - std::ostringstream s; - s << method << " name:[" << GetReqName() << "]" - << ", ep:[" << ctx_.peer() << "]" - << " " << status << " using req_id:" << req_id_; - return s.str(); - } - - CallStatus Status() const { - std::lock_guard l(status_mu_); - return status_; - } - - template - void Finish(const T& reply, ServerAsyncResponseWriter* responder) { - std::lock_guard l(status_mu_); - status_ = FINISH; - responder->Finish(reply, ::grpc::Status::OK, - reinterpret_cast(static_cast(req_id_))); - } - virtual std::string GetReqName() = 0; - - protected: - mutable std::mutex status_mu_; - ::grpc::ServerContext ctx_; - GrpcService::AsyncService* service_; - ::grpc::ServerCompletionQueue* cq_; - CallStatus status_; - RequestHandler* request_handler_; - int req_id_; -}; - -class RequestSend final : public RequestBase { - public: - explicit RequestSend(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = static_cast(distributed::GrpcMethod::kSendVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - virtual ~RequestSend() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string varname = GetReqName(); - - auto scope = request_->GetMutableLocalScope(); - auto invar = request_->GetVar(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestSend var_name:" << varname << " trainer: 
" << trainer_id; - - framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); - Finish(reply_, &responder_); - } - - protected: - sendrecv::VoidMessage reply_; - std::shared_ptr request_; - ServerAsyncResponseWriter responder_; -}; - -class RequestGet final : public RequestBase { - public: - explicit RequestGet(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - auto method_id = static_cast(distributed::GrpcMethod::kGetVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGet() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. - std::string varname = request_.varname(); - std::string out_varname = request_.out_varname(); - std::string table_name = request_.table_name(); - int trainer_id = request_.trainer_id(); - - VLOG(4) << "RequestGet " << out_varname << " from " << varname; - - auto scope = request_handler_->scope(); - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - tmp_scope_ = std::move(scope->NewTmpScope()); - request_handler_->Handle(varname, tmp_scope_.get(), invar, &outvar, - trainer_id, out_varname, table_name); - - VLOG(1) << "before SerializeToByteBuffer"; - if (outvar) { - SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), - &reply_); - } - VLOG(1) << "after SerializeToByteBuffer"; - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - std::unique_ptr tmp_scope_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -class RequestGetNoBarrier final : public RequestBase { - public: - explicit RequestGetNoBarrier(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetVariableNoBarrier); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetNoBarrier() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. 
- std::string varname = request_.varname(); - std::string out_varname = request_.out_varname(); - int trainer_id = request_.trainer_id(); - - VLOG(4) << "RequestGetNoBarrier " << out_varname << " from " << varname; - - auto scope = request_handler_->scope(); - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), - &reply_); - } - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -class RequestGetMonomerVariable final : public RequestBase { - public: - explicit RequestGetMonomerVariable(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, - int req_id, RPCServer* rpc_server) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - rpc_server_(rpc_server) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetMonomerVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetMonomerVariable() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. - std::string varname = request_.varname(); - - rpc_server_->WaitVarCond(varname); - MonomerHandle h = rpc_server_->GetMonomer(varname); - - auto scope = h.scope_; - auto invar = scope->FindVar(varname); - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, - request_.trainer_id()); - - if (outvar) { - SerializeToByteBuffer(varname, outvar, *h.dev_ctx_, &reply_); - } - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; - RPCServer* rpc_server_{nullptr}; -}; - -class RequestGetMonomerBarrier final : public RequestBase { - public: - explicit RequestGetMonomerBarrier(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id, - RPCServer* rpc_server) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - rpc_server_(rpc_server) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetMonomerBarrier); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetMonomerBarrier() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. 
- std::string varname = request_.varname(); - VLOG(4) << "RequestGetMonomerBarrier " << varname; - - rpc_server_->WaitVarCond(varname); - MonomerHandle h = rpc_server_->GetMonomer(varname); - - framework::Scope* scope = nullptr; - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, - request_.trainer_id()); - - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - sendrecv::VoidMessage reply_; - ServerAsyncResponseWriter responder_; - RPCServer* rpc_server_{nullptr}; -}; - -class RequestPrefetch final : public RequestBase { - public: - explicit RequestPrefetch(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - local_scope_(nullptr) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = - static_cast(distributed::GrpcMethod::kPrefetchVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestPrefetch() {} - - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - // prefetch process... - std::string in_var_name = request_->Varname(); - std::string out_var_name = request_->OutVarname(); - std::string table_name = request_->TableName(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name - << " out_var_name: " << out_var_name << " trainer: " << trainer_id; - - auto scope = request_->GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - // out var must be created in local scope! 
- framework::Variable* outvar = scope->Var(out_var_name); - - request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - - SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), - &reply_); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; - framework::Scope* local_scope_; -}; - -class RequestCheckpointNotify final : public RequestBase { - public: - explicit RequestCheckpointNotify(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx())); - int method_id = - static_cast(distributed::GrpcMethod::kCheckpointNotify); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestCheckpointNotify() {} - - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - auto scope = request_->GetMutableLocalScope(); - - std::string checkpoint_notify = request_->Varname(); - std::string checkpoint_dir = request_->OutVarname(); - int trainer_id = request_->GetTrainerId(); - std::string table_name = request_->TableName(); - - VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify - << ", dir: " << checkpoint_dir; - - request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr, - trainer_id, checkpoint_dir, table_name); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - sendrecv::VoidMessage reply_; - ServerAsyncResponseWriter responder_; -}; - -class RequestNotify final : public RequestBase { - public: - explicit RequestNotify(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = static_cast(distributed::GrpcMethod::kRequestNotify); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - virtual ~RequestNotify() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string varname = GetReqName(); - VLOG(4) << "RequestNotify var_name:" << varname; - - auto scope = request_->GetMutableLocalScope(); - auto invar = request_->GetVar(); - int trainer_id = request_->GetTrainerId(); - framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); - Finish(reply_, &responder_); - } - - protected: - sendrecv::VoidMessage reply_; - std::shared_ptr request_; - ServerAsyncResponseWriter responder_; -}; - -class RequestSendAndRecv final : public RequestBase { - public: - explicit RequestSendAndRecv(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - - int method_id = - static_cast(distributed::GrpcMethod::kRequestSendAndRecv); - - 
service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestSendAndRecv() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string in_var_name = request_->Varname(); - std::string out_var_name = request_->OutVarname(); - std::string table_name = request_->TableName(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestSendAndRecv, in_var_name: " << in_var_name - << " out_var_name: " << out_var_name << " trainer: " << trainer_id; - auto scope = request_->GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - framework::Variable* outvar = nullptr; - request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), - &reply_); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -void AsyncGRPCServer::WaitServerReady() { - VLOG(4) << "AsyncGRPCServer is waiting server ready"; - std::unique_lock lock(this->mutex_ready_); - condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(4) << "AsyncGRPCServer WaitSeverReady"; -} - -// Define an option subclass in order to disable SO_REUSEPORT for the -// server socket. -// Come from: -// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc -class NoReusePortOption : public ::grpc::ServerBuilderOption { - public: - void UpdateArguments(::grpc::ChannelArguments* args) override { - args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); - } - - void UpdatePlugins(std::vector>* - plugins) override {} -}; - -void AsyncGRPCServer::StartServer() { - for (int i = 0; i < FLAGS_rpc_retry_bind_port; i++) { - ::grpc::ServerBuilder builder; - std::unique_ptr service( - new GrpcService::AsyncService()); - builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(), - &selected_port_); - - builder.SetMaxSendMessageSize(std::numeric_limits::max()); - builder.SetMaxReceiveMessageSize(std::numeric_limits::max()); - if (FLAGS_rpc_disable_reuse_port) { - builder.SetOption( - std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption)); - LOG(INFO) << "set FLAGS_rpc_disable_reuse_port"; - } - builder.RegisterService(service.get()); - - for (auto t : rpc_call_map_) { - rpc_cq_[t.first].reset(builder.AddCompletionQueue().release()); - } - - server_ = builder.BuildAndStart(); - if (selected_port_ != 0) { - LOG(INFO) << "Server listening on " << bind_address_ - << " successful, selected port: " << selected_port_; - service_.reset(service.release()); - break; - } - - LOG(WARNING) << "Server listening on " << bind_address_ - << " failed, selected port: " << selected_port_ - << ", retry after 3 seconds!"; - - sleep(3); - } - - PADDLE_ENFORCE_NE( - selected_port_, 0, - platform::errors::Unavailable("can't bind to address:%s", bind_address_)); - - std::function f = - std::bind(&AsyncGRPCServer::TryToRegisterNewOne, this, - std::placeholders::_1, std::placeholders::_2); - - for (auto& t : rpc_call_map_) { - auto& rpc_name = t.first; - auto& cq = rpc_cq_[rpc_name]; - auto threadnum = rpc_thread_num_[rpc_name]; - auto& reqs = rpc_reqs_[rpc_name]; - - reqs.reserve(kRequestBufSize); - - for (int i = 0; i < kRequestBufSize; i++) { - VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " 
I: " << i; - TryToRegisterNewOne(rpc_name, i); - } - - for (int i = 0; i < threadnum; i++) { - rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind( - &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f))); - VLOG(4) << t.first << " creates threads!"; - } - } - - { - std::lock_guard lock(this->mutex_ready_); - ready_ = 1; - } - condition_ready_.notify_all(); - - // wait server - server_->Wait(); - - for (auto& t : rpc_threads_) { - auto& threads = t.second; - for (size_t i = 0; i < threads.size(); ++i) { - threads[i]->join(); - VLOG(4) << t.first << " threads ends!"; - } - } -} - -void AsyncGRPCServer::ShutdownQueue() { - for (auto& t : rpc_cq_) { - t.second->Shutdown(); - VLOG(4) << t.first << " queue shutdown!"; - } -} - -void AsyncGRPCServer::ShutDownImpl() { - std::unique_lock lock(cq_mutex_); - is_shut_down_ = true; - ShutdownQueue(); - - VLOG(4) << "server_ shutdown!"; - server_->Shutdown(); -} - -void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, - int req_id) { - std::unique_lock lock(cq_mutex_); - if (is_shut_down_) { - VLOG(4) << "shutdown, do not TryToRegisterNewSendOne"; - return; - } - - VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name - << " REQ ID: " << req_id; - - auto& reqs = rpc_reqs_[rpc_name]; - auto& handler = rpc_call_map_[rpc_name]; - auto& cq = rpc_cq_[rpc_name]; - - RequestBase* b = nullptr; - if (rpc_name == kRequestSend) { - b = new RequestSend(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestGet) { - b = new RequestGet(service_.get(), cq.get(), handler, req_id); - - } else if (rpc_name == kRequestGetNoBarrier) { - b = new RequestGetNoBarrier(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestGetMonomerVariable) { - b = new RequestGetMonomerVariable(service_.get(), cq.get(), handler, req_id, - this); - } else if (rpc_name == kRequestGetMonomerBarrier) { - b = new RequestGetMonomerBarrier(service_.get(), cq.get(), handler, req_id, - this); - } else if (rpc_name == kRequestPrefetch) { - b = new RequestPrefetch(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestCheckpoint) { - b = new RequestCheckpointNotify(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestNotify) { - b = new RequestNotify(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestSendAndRecv) { - b = new RequestSendAndRecv(service_.get(), cq.get(), handler, req_id); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("not supported rpc: %s", rpc_name)); - } - - reqs[req_id] = b; - - VLOG(4) << "TryToRegisterNewOne status:" << b->Status(); -} - -void AsyncGRPCServer::HandleRequest( - ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, - std::function TryToRegisterNewOne) { - void* tag = NULL; - bool ok = false; - - while (true) { - VLOG(4) << "HandleRequest " << rpc_name << " wait next"; - if (!cq->Next(&tag, &ok)) { - VLOG(4) << "CompletionQueue " << rpc_name << " shutdown!"; - break; - } - - int req_id = static_cast(reinterpret_cast(tag)); - VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id - << " get next"; - - auto& reqs = rpc_reqs_[rpc_name]; - RequestBase* base = nullptr; - { - PADDLE_ENFORCE_EQ( - (req_id >= 0 && req_id < kRequestBufSize), true, - platform::errors::OutOfRange("request id: %s out of bounds: [0, %s)", - req_id, kRequestBufSize)); - std::unique_lock lock(cq_mutex_); - base = reqs[req_id]; - } - - VLOG(3) << base->Status2String(rpc_name); - - // reference: - // 
https://github.com/tensorflow/tensorflow/issues/5596 - // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM - // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I - if (!ok) { - VLOG(4) << "completion queue:" << rpc_name << " recv no regular event" - << " context:" << base->Status2String(rpc_name); - TryToRegisterNewOne(rpc_name, req_id); - delete base; - continue; - } - - switch (base->Status()) { - case PROCESS: { - base->Process(); - break; - } - case FINISH: { - TryToRegisterNewOne(rpc_name, req_id); - delete base; - break; - } - default: { assert(false); } - } - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.h b/paddle/fluid/operators/distributed/grpc/grpc_server.h deleted file mode 100644 index 3d68b7e8cebb4..0000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include // NOLINT -#include -#include - -#include "grpc++/grpc++.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_service.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/profiler.h" - -namespace grpc { -class ServerCompletionQueue; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestBase; - -class AsyncGRPCServer final : public RPCServer { - public: - explicit AsyncGRPCServer(const std::string& address, int client_num) - : RPCServer(address, client_num), ready_(0) {} - - virtual ~AsyncGRPCServer() {} - void WaitServerReady() override; - void StartServer() override; - - private: - // HandleRequest needs to be thread-safe. 
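The HandleRequest loop above drives gRPC's asynchronous API: each pending call is registered with a small integer slot id smuggled through the opaque completion-queue tag, and when the event fires the id is decoded again to locate (and, after FINISH, replace) the corresponding RequestBase slot. Below is a minimal standalone sketch of just that tag round trip, added here for illustration only; it is not part of this diff and deliberately uses no gRPC API, and all names in it are hypothetical.

```cpp
// Illustrative sketch: round-tripping a small request id through the opaque
// void* tag handed to (and returned by) a completion queue.
#include <cassert>
#include <cstdint>
#include <iostream>

// Encode a slot index as the opaque tag registered with the completion queue.
void* EncodeTag(int req_id) {
  return reinterpret_cast<void*>(static_cast<intptr_t>(req_id));
}

// Decode the tag back into the slot index when the event completes.
int DecodeTag(void* tag) {
  return static_cast<int>(reinterpret_cast<intptr_t>(tag));
}

int main() {
  const int kRequestBufSize = 100;  // mirrors the fixed slot count above
  for (int req_id = 0; req_id < kRequestBufSize; ++req_id) {
    void* tag = EncodeTag(req_id);
    assert(DecodeTag(tag) == req_id);  // the id survives the round trip
  }
  std::cout << "all " << kRequestBufSize << " tags round-tripped\n";
  return 0;
}
```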
- void HandleRequest( - ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, - std::function TryToRegisterNewOne); - - void TryToRegisterNewOne(const std::string& rpc_name, int req_id); - void ShutdownQueue(); - void ShutDownImpl() override; - - private: - static const int kRequestBufSize = 100; - - std::mutex cq_mutex_; - volatile bool is_shut_down_ = false; - - std::unique_ptr service_; - std::unique_ptr<::grpc::Server> server_; - - // condition of the sub program - std::condition_variable barrier_condition_; - - std::mutex mutex_ready_; - std::condition_variable condition_ready_; - - int ready_; - - std::map> rpc_cq_; - std::map>> rpc_threads_; - std::map> rpc_reqs_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h deleted file mode 100644 index 10037c90853de..0000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_service.h +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/platform/profiler.h" - -// NOTE: This method was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// method and did some modifications so that we can parse gRPC -// requests without too much copying of the tensor data. - -namespace grpc { -class CompletionQueue; -class Channel; -class RpcService; -class ServerCompletionQueue; -class ServerContext; - -// Support parsing/unparsing of tensorflow::VariableResponse. -// Wire-format is identical to RecvVariableResponse. 
-template <> -class SerializationTraits< - paddle::operators::distributed::GRPCVariableResponse> { - public: - static Status Serialize( - const paddle::operators::distributed::GRPCVariableResponse& msg, - grpc_byte_buffer** bp, bool* own_buffer) { - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "SerializationTraits::Serialize not implemented!")); - return Status(); - } - static Status Deserialize( - grpc_byte_buffer* buffer, - paddle::operators::distributed::GRPCVariableResponse* msg, - int max_message_size = INT_MAX) { - if (buffer == nullptr) { - return Status(StatusCode::INTERNAL, "No payload"); - } - - Status result = g_core_codegen_interface->ok(); - if (result.ok()) { - paddle::operators::distributed::GrpcByteSource source(buffer); - int ret = msg->Parse(&source); - if (ret != 0) { - result = Status(StatusCode::INTERNAL, "VariableResponse parse error"); - } - } - g_core_codegen_interface->grpc_byte_buffer_destroy(buffer); - return result; - } -}; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -enum class GrpcMethod { - kSendVariable, - kGetVariable, - kPrefetchVariable, - kCheckpointNotify, - kGetVariableNoBarrier, - kGetMonomerVariable, - kGetMonomerBarrier, - kRequestNotify, - kRequestSendAndRecv, - // when you add new handler, change kGrpcNumMethods at the same time! -}; - -static const int kGrpcNumMethods = - static_cast(GrpcMethod::kRequestSendAndRecv) + 1; - -inline const char* GrpcMethodName(GrpcMethod id) { - switch (id) { - case GrpcMethod::kSendVariable: - return "/sendrecv.SendRecvService/SendVariable"; - case GrpcMethod::kGetVariable: - return "/sendrecv.SendRecvService/GetVariable"; - case GrpcMethod::kGetVariableNoBarrier: - return "/sendrecv.SendRecvService/GetVariableNoBarrier"; - case GrpcMethod::kGetMonomerVariable: - return "/sendrecv.SendRecvService/GetMonomerVariable"; - case GrpcMethod::kGetMonomerBarrier: - return "/sendrecv.SendRecvService/GetMonomerBarrier"; - case GrpcMethod::kPrefetchVariable: - return "/sendrecv.SendRecvService/PrefetchVariable"; - case GrpcMethod::kCheckpointNotify: - return "/sendrecv.SendRecvService/CheckpointNotify"; - case GrpcMethod::kRequestNotify: - return "/sendrecv.SendRecvService/DistributeNotify"; - case GrpcMethod::kRequestSendAndRecv: - return "/sendrecv.SendRecvService/SendAndRecvVariable"; - } - - // Shouldn't be reached. - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid id: not found valid method name")); - return nullptr; -} - -class GrpcService final { - public: - class AsyncService : public ::grpc::Service { - public: - AsyncService() { - for (int i = 0; i < kGrpcNumMethods; ++i) { - AddMethod(new ::grpc::internal::RpcServiceMethod( - GrpcMethodName(static_cast(i)), - ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr)); - ::grpc::Service::MarkMethodAsync(i); - } - } - virtual ~AsyncService() {} - - // Make RequestAsyncUnary public for grpc_call.h - using ::grpc::Service::RequestAsyncUnary; - }; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc deleted file mode 100644 index f7679e9fc924d..0000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc +++ /dev/null @@ -1,344 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
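The deleted grpc_service.h above keeps the RPC surface in sync by deriving the method count from the last enumerator and mapping every enum value to a fully qualified "/sendrecv.SendRecvService/..." name. The following is a small standalone sketch of that pattern, added for illustration; it is not part of the diff, and the shortened enum and function names are hypothetical (only the method-name strings come from the original).

```cpp
// Illustrative sketch: enum-driven registration table for async RPC methods.
#include <iostream>
#include <string>

enum class Method { kSend, kGet, kPrefetch };  // add new values before this line

// Number of methods follows from the last enumerator, as kGrpcNumMethods does.
constexpr int kNumMethods = static_cast<int>(Method::kPrefetch) + 1;

std::string MethodName(Method m) {
  switch (m) {
    case Method::kSend:     return "/sendrecv.SendRecvService/SendVariable";
    case Method::kGet:      return "/sendrecv.SendRecvService/GetVariable";
    case Method::kPrefetch: return "/sendrecv.SendRecvService/PrefetchVariable";
  }
  return "";  // unreachable while the switch stays in sync with the enum
}

int main() {
  // A service constructor can loop over the ids exactly like AsyncService does.
  for (int i = 0; i < kNumMethods; ++i) {
    std::cout << i << " -> " << MethodName(static_cast<Method>(i)) << "\n";
  }
  return 0;
}
```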
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include - -#include "google/protobuf/io/coded_stream.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" - -namespace google { -namespace protobuf { -namespace io { -class ZeroCopyInputStream; -} // namespace io -} // namespace protobuf -} // namespace google -namespace grpc { -class ByteBuffer; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -enum WireType { - WIRETYPE_VARINT = 0, - WIRETYPE_LENGTH_DELIMITED = 2, -}; - -inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; } - -inline WireType GetTagWireType(uint32_t tag) { - return static_cast(tag & 0x7); -} - -bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input, - int* result) { - uint64_t v; - if (input->ReadVarint64(&v) && v <= static_cast(INT_MAX)) { - *result = static_cast(v); - return true; - } else { - return false; - } -} - -int GRPCVariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) { - GrpcByteBufferSource source; - source.Init(byte_buffer); - GrpcByteBufferSourceWrapper r(&source); - - return Parse(&r); -} - -bool ParseLodData(::google::protobuf::io::CodedInputStream* input, - std::vector* lod) { - while (true) { - auto p = input->ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - - if (!p.second) { - return (tag == 0); - } - - switch (tag) { - case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: { - uint64_t v; - if (wt == WIRETYPE_VARINT) { - if (!input->ReadVarint64(&v)) { - return false; - } - lod->push_back(v); - break; - } - - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input->ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input->CurrentPosition(); - while (input->CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input->ReadVarint64(&v)) { - return tag; - } - lod->push_back(v); - } - break; - } - - return false; - } - default: { return false; } - } - } - - return true; -} - -int GRPCVariableResponse::Parse(Source* source) { - ::google::protobuf::io::ZeroCopyInputStream* input_stream = - source->contents(); - ::google::protobuf::io::CodedInputStream input(input_stream); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); - - while (true) { - auto p = input.ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - if (!p.second) { - if (tag != 0) { - return -1; - } - return 0; - } - - switch (tag) { - case sendrecv::VariableMessage::kVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - 
return tag; - } - - meta_.set_varname(temp); - break; - } - case sendrecv::VariableMessage::kTypeFieldNumber: { - uint32_t v; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_type(static_cast<::sendrecv::VarType>(v)); - break; - } - case sendrecv::VariableMessage::kDataTypeFieldNumber: { - uint32_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v)); - break; - } - case sendrecv::VariableMessage::kDimsFieldNumber: { - // not packed - if (wt == WIRETYPE_VARINT) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - break; - } - - // packed - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input.ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input.CurrentPosition(); - while (input.CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - } - break; - } - return tag; - } - case sendrecv::VariableMessage::kLodLevelFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_lod_level(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kLodFieldNumber: { - int length = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &length)) { - return tag; - } - - std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p = - input.IncrementRecursionDepthAndPushLimit(length); - - std::vector lod_data; - if (p.second < 0 || !ParseLodData(&input, &lod_data)) { - return tag; - } - - if (!input.DecrementRecursionDepthAndPopLimit(p.first)) { - return tag; - } - - if (lod_data.size() == 0) { - break; - } - - auto lod = meta_.add_lod(); - for (uint32_t i = 0; i < lod_data.size(); i++) { - lod->add_lod_data(lod_data[i]); - } - break; - } - case sendrecv::VariableMessage::kSlrHeightFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_slr_height(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kSerializedFieldNumber: { - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (!ProcSerializedField(tag, &input, num_bytes)) { - return tag; - } - - break; - } - case sendrecv::VariableMessage::kRowsFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR) && - meta_.varname() != "", - platform::errors::PreconditionNotMet( - "meta info should be got first!")); - - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { - return tag; - } - break; - } - case sendrecv::VariableMessage::kOutVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } - - meta_.set_out_varname(temp); - break; - } - case sendrecv::VariableMessage::kProfileFieldNumber: { - uint64_t profiling = 0; - if (!input.ReadVarint64(&profiling)) { - return tag; - } - meta_.set_profile(profiling); - int64_t listener_id = platform::ListenerId(); - if (listener_id <= 0) { - break; - } - if (profiling == platform::kEnableProfiler && - !platform::IsProfileEnabled()) { 
- platform::EnableProfiler(platform::ProfilerState::kCPU); - } else if (profiling == platform::kDisableProfiler && - platform::IsProfileEnabled()) { - platform::DisableProfiler( - platform::EventSortingKey::kDefault, - string::Sprintf("%s_%lld", FLAGS_rpc_server_profile_path, - listener_id)); - } - break; - } - case sendrecv::VariableMessage::kTrainerIdFieldNumber: { - uint64_t trainer_id = 0; - if (!input.ReadVarint64(&trainer_id)) { - return tag; - } - meta_.set_trainer_id(trainer_id); - break; - } - case sendrecv::VariableMessage::kTableNameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } - - meta_.set_table_name(temp); - break; - } - default: { - // Unknown tag, return unknown error. - return -1; - } - } - } - - return 0; -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h deleted file mode 100644 index 4d12b4a4bacd7..0000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -namespace grpc { -class ByteBuffer; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class GRPCVariableResponse : public VariableResponse { - public: - GRPCVariableResponse(const framework::Scope* scope, - const platform::DeviceContext* dev_ctx, - bool create_scope = false) - : VariableResponse(scope, dev_ctx, create_scope) {} - - virtual ~GRPCVariableResponse() {} - - int Parse(Source* source) override; - - // return: - // 0:ok. - // -1: unkown error. - // other: number of error field. 
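The hand-rolled parser in grpc_variable_response.cc above walks the protobuf wire format directly: every field starts with a varint tag encoding `(field_number << 3) | wire_type`, so the parser recovers the field with a shift and the wire type with a mask before deciding how to read the payload. A minimal standalone sketch of that tag decoding follows, added for illustration only; it is not part of the diff.

```cpp
// Illustrative sketch: splitting a protobuf tag into field number and wire type.
#include <cstdint>
#include <iostream>

enum WireType { WIRETYPE_VARINT = 0, WIRETYPE_LENGTH_DELIMITED = 2 };

int GetTagFieldNumber(uint32_t tag) { return tag >> 3; }
WireType GetTagWireType(uint32_t tag) { return static_cast<WireType>(tag & 0x7); }

int main() {
  uint32_t tag_varint = (4 << 3) | WIRETYPE_VARINT;            // field 4, varint
  uint32_t tag_bytes  = (1 << 3) | WIRETYPE_LENGTH_DELIMITED;  // field 1, bytes (0x0A)
  std::cout << GetTagFieldNumber(tag_varint) << " "
            << GetTagWireType(tag_varint) << "\n";  // prints: 4 0
  std::cout << GetTagFieldNumber(tag_bytes) << " "
            << GetTagWireType(tag_bytes) << "\n";   // prints: 1 2
  return 0;
}
```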
- int Parse(const ::grpc::ByteBuffer& byte_buffer); -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor.cc b/paddle/fluid/operators/distributed/heart_beat_monitor.cc deleted file mode 100644 index 9f537f5334898..0000000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" - -#include - -namespace paddle { -namespace operators { -namespace distributed { - -DEFINE_int32(worker_update_interval_secs, 900, - " the longest time interval between the worker update variables"); - -inline int GetCurrentUS() { - // current date/time based on current system - time_t t = std::time(0); - int now = static_cast(t); - return now; -} - -void HeartBeatMonitor::Update(const int worker_id, std::string be_monitored_var, - WorkerStatus status) { - if (status == UNINITED) { - LOG(WARNING) << "HeartBeatMonitor receive UNINITED status can not be used " - "in Update, something error"; - } - - if (!is_chief_) { - return; - } - - if ((be_monitored_var == be_monitored_var_ && status == RUNNING) || - status == COMPLETED) { - auto timestamp = GetCurrentUS(); - UnderMonitoredWorker& worker = worker_status_map_.at(worker_id); - - if (worker.status != COMPLETED) { - worker.status = status; - } - worker.timestamp = timestamp; - return; - } -} - -void HeartBeatMonitor::LostWorkerMonitor() { - VLOG(1) << "worker heartbeat monitor start at No.0 parameter server"; - while (running_) { - for (int id = 0; id < workers_; ++id) { - auto& worker = worker_status_map_.at(id); - - if (worker.status == UNINITED) { - VLOG(4) << "worker " << worker.id << " is under UNINITED"; - continue; - } - if (worker.status == COMPLETED) { - VLOG(4) << "worker " << worker.id << " is under COMPLETED"; - continue; - } - - auto timestamp = GetCurrentUS(); - - VLOG(4) << "worker " << worker.id << " status is " << worker.status - << " timestamp is " << worker.timestamp << " the interval is " - << timestamp - worker.timestamp; - - if (timestamp - worker.timestamp >= FLAGS_worker_update_interval_secs) { - PADDLE_THROW(platform::errors::ExecutionTimeout( - "the latest update of worker %d is %d secs ago, we doubt the " - "the worker is not alive and this may have a bad effect on the " - "fitting result, please check", - worker.id, FLAGS_worker_update_interval_secs)); - } - } - - std::this_thread::sleep_for(std::chrono::milliseconds(10 * 1000)); - } - VLOG(1) << "worker heartbeat monitor stopped, thread exit"; -} - -std::once_flag HeartBeatMonitor::init_flag_; -std::unique_ptr HeartBeatMonitor::monitor_(nullptr); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor.h b/paddle/fluid/operators/distributed/heart_beat_monitor.h deleted file 
mode 100644 index d96433c318b35..0000000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor.h +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "gflags/gflags.h" - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -enum WorkerStatus { UNINITED = 0, RUNNING, COMPLETED }; - -struct UnderMonitoredWorker { - int id; - WorkerStatus status; - int timestamp; - - UnderMonitoredWorker() {} - - explicit UnderMonitoredWorker(int worker_id) { - this->id = worker_id; - this->status = UNINITED; - this->timestamp = 0; - } -}; - -class HeartBeatMonitor { - public: - explicit HeartBeatMonitor(int workers, bool is_chief, - std::string be_monitored_var) - : workers_(workers), - is_chief_(is_chief), - be_monitored_var_(be_monitored_var), - running_(true) { - PADDLE_ENFORCE_GT(workers, 0, platform::errors::InvalidArgument( - "workers must greater than 0.")); - - for (auto worker_id = 0; worker_id < workers; worker_id++) { - UnderMonitoredWorker worker(worker_id); - worker_status_map_[worker_id] = std::move(worker); - } - - // we define the No.0 pserver is the first parameter server - // only No.0 will check the heartbeat of all trainers - if (is_chief) { - monitor_thread_.reset(new std::thread( - std::bind(&HeartBeatMonitor::LostWorkerMonitor, this))); - } - } - - ~HeartBeatMonitor() { - running_ = false; - if (monitor_thread_) monitor_thread_->join(); - } - - static void Init(int workers, bool is_chief, std::string be_monitored_var) { - std::call_once(init_flag_, &HeartBeatMonitor::InitImpl, workers, is_chief, - be_monitored_var); - } - - static HeartBeatMonitor* GetInstance() { return monitor_.get(); } - - void Stop() { - running_ = false; - if (!monitor_) { - VLOG(0) << "HeartBeatMonitor is not inited, do nothing"; - } else { - if (monitor_thread_) { - monitor_thread_->join(); - monitor_thread_.reset(nullptr); - } - } - } - - void Update(const int worker_id, std::string be_monitored_var, - WorkerStatus status); - - void LostWorkerMonitor(); - - private: - // Init is called by GetInstance. 
- static void InitImpl(int workers, bool is_chief, - std::string be_monitored_var) { - if (monitor_ == nullptr) { - monitor_.reset(new HeartBeatMonitor(workers, is_chief, be_monitored_var)); - } - } - - static std::once_flag init_flag_; - static std::unique_ptr monitor_; - - int workers_; - bool is_chief_; - std::string be_monitored_var_; - std::unordered_map worker_status_map_; - std::unique_ptr monitor_thread_{nullptr}; - std::mutex mutex_; - bool running_ = false; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc b/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc deleted file mode 100644 index 8505023f63a95..0000000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" - -#include "gtest/gtest.h" - -namespace paddle { -namespace operators { -namespace distributed { - -void run(HeartBeatMonitor* monitor) { monitor->LostWorkerMonitor(); } - -TEST(HeartBeatMonitor, All) { - int trainers = 10; - int pserver_id = 0; - std::string var = "nce_w@GRAD.block0"; - std::string var2 = "nce_w@GRAD.block2"; - - HeartBeatMonitor::Init(trainers, pserver_id == 0, var); - - auto* monitor = HeartBeatMonitor::GetInstance(); - - std::vector ids{1, 3, 5, 7}; - - for (auto& id : ids) { - monitor->Update(id, var, RUNNING); - } - - monitor->Update(9, var2, RUNNING); - monitor->Update(2, var, COMPLETED); - - std::thread t(run, monitor); - t.detach(); - - std::this_thread::sleep_for(std::chrono::milliseconds(15 * 1000)); - - monitor->Stop(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/large_scale_kv.h b/paddle/fluid/operators/distributed/large_scale_kv.h deleted file mode 100644 index da2281231fc8a..0000000000000 --- a/paddle/fluid/operators/distributed/large_scale_kv.h +++ /dev/null @@ -1,848 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
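The heartbeat monitor removed above boils down to one rule in LostWorkerMonitor: a non-completed worker whose last Update() timestamp is older than FLAGS_worker_update_interval_secs is treated as lost. Below is a minimal standalone sketch of that liveness check, added for illustration; it is not part of the diff, and apart from the 900-second default the struct and function names are hypothetical.

```cpp
// Illustrative sketch: timestamp-based lost-worker detection.
#include <iostream>
#include <unordered_map>
#include <vector>

struct Worker {
  int last_update;  // seconds since epoch of the last Update() call
  bool completed;   // COMPLETED workers are never reported as lost
};

std::vector<int> FindLostWorkers(const std::unordered_map<int, Worker>& workers,
                                 int now, int interval_secs) {
  std::vector<int> lost;
  for (const auto& kv : workers) {
    if (kv.second.completed) continue;
    if (now - kv.second.last_update >= interval_secs) lost.push_back(kv.first);
  }
  return lost;
}

int main() {
  const int kInterval = 900;  // default of FLAGS_worker_update_interval_secs above
  int now = 10000;
  std::unordered_map<int, Worker> workers = {
      {0, {now - 10, false}},    // recently updated -> alive
      {1, {now - 1200, false}},  // stale -> lost
      {2, {now - 5000, true}},   // completed -> ignored
  };
  for (int id : FindLostWorkers(workers, now, kInterval)) {
    std::cout << "worker " << id << " looks lost\n";  // prints only worker 1
  }
  return 0;
}
```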
- -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "gflags/gflags.h" - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/rw_lock.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/port.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/string_helper.h" - -namespace paddle { -namespace operators { -namespace distributed { - -enum Mode { training, infer }; -enum InitType { uniform_random, fill_constant, gaussian_random }; - -inline std::vector bucket(const int v_size, const int b_size) { - int remainder = v_size % b_size; - int bucket = v_size / b_size; - std::vector ret_vec(b_size, bucket); - for (int i = 0; i < remainder; ++i) { - ret_vec[i] = ret_vec[i] + 1; - } - int cur_bucket = 0; - for (int &j : ret_vec) { - int tmp = j; - j = cur_bucket; - cur_bucket += tmp; - } - ret_vec.push_back(cur_bucket); - return ret_vec; -} - -class Initializer { - public: - Initializer() {} - - explicit Initializer(const std::vector &attrs) {} - - virtual float GetValue() = 0; - - virtual ~Initializer() {} - - protected: - std::string name_; - unsigned int seed_; -}; - -class UniformInitializer : public Initializer { - public: - explicit UniformInitializer(const std::vector &attrs) { - name_ = attrs[0]; - seed_ = static_cast(std::stoi(attrs[1])); - min_ = std::stof(attrs[2]); - max_ = std::stof(attrs[3]); - - dist_ = std::uniform_real_distribution(min_, max_); - random_engine_ = framework::GetCPURandomEngine(seed_); - } - - float GetValue() override { return dist_(*random_engine_); } - - private: - float min_; - float max_; - - std::shared_ptr random_engine_; - std::uniform_real_distribution dist_; -}; - -template -inline bool entry(const int count, const T threshold); - -template <> -inline bool entry(const int count, const std::string threshold) { - return true; -} - -template <> -inline bool entry(const int count, const int threshold) { - return count >= threshold; -} - -template <> -inline bool entry(const int count, const float threshold) { - UniformInitializer uniform = UniformInitializer({"0", "0", "1"}); - return uniform.GetValue() >= threshold; -} - -class GaussianInitializer : public Initializer { - public: - explicit GaussianInitializer(const std::vector &attrs) { - name_ = attrs[0]; - seed_ = static_cast(std::stoi(attrs[1])); - mean_ = std::stof(attrs[2]); - std_ = std::stof(attrs[3]); - - random_engine_ = framework::GetCPURandomEngine(seed_); - - dist_ = std::normal_distribution(mean_, std_); - } - - float GetValue() override { return dist_(*random_engine_); } - - private: - float std_; - float mean_; - - std::shared_ptr random_engine_; - std::normal_distribution dist_; -}; - -class FillConstantInitializer : public Initializer { - public: - explicit FillConstantInitializer(const std::vector &attrs) { - name_ = attrs[0]; - value_ = std::stof(attrs[1]); - } - - float GetValue() override { return value_; } - - private: - float value_; -}; - -struct SparseMeta { - std::string name; - std::string grad_name; - std::vector value_names; - std::vector value_dims; - std::vector cached_varnames; - 
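The bucket() helper defined earlier in large_scale_kv.h above converts a total item count and a bucket count into begin/end offsets (the first `v_size % b_size` buckets get one extra element); callers later slice work as `[buckets[j], buckets[j+1])`. The following self-contained sketch, added for illustration and not part of the diff, reproduces that arithmetic so it can be checked on a concrete case.

```cpp
// Illustrative sketch: turning per-bucket sizes into cumulative begin offsets.
#include <iostream>
#include <vector>

std::vector<int> Bucket(int v_size, int b_size) {
  int remainder = v_size % b_size;
  std::vector<int> sizes(b_size, v_size / b_size);
  for (int i = 0; i < remainder; ++i) sizes[i] += 1;
  // Convert sizes into begin offsets, then append the overall end.
  int cur = 0;
  for (int& s : sizes) { int tmp = s; s = cur; cur += tmp; }
  sizes.push_back(cur);
  return sizes;
}

int main() {
  // 10 ids over 3 buckets -> ranges [0,4), [4,7), [7,10).
  std::vector<int> offsets = Bucket(10, 3);
  for (size_t j = 0; j + 1 < offsets.size(); ++j) {
    std::cout << "bucket " << j << ": [" << offsets[j] << ", "
              << offsets[j + 1] << ")\n";
  }
  return 0;
}
```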
std::vector initializer_attrs; - std::string entry; - Mode mode; - - std::string ToString() { - std::stringstream ss; - ss << "name: " << name << " "; - ss << "mode: " << mode << " "; - - for (int i = 0; i < static_cast(value_names.size()); i++) { - ss << "value_name: " << value_names[i] << " dim: " << value_dims[i] - << " "; - } - - ss << " grad var: " << grad_name; - - ss << " cached varnames: "; - for (int i = 0; i < static_cast(cached_varnames.size()); i++) { - ss << cached_varnames[i] << " "; - } - - ss << " initializer attrs: "; - for (int i = 0; i < static_cast(initializer_attrs.size()); i++) { - ss << initializer_attrs[i] << " "; - } - - ss << " entry attrs: " << entry; - - return ss.str(); - } -}; - -struct VALUE { - explicit VALUE(const std::vector &names) - : names_(names), count_(0), unseen_days_(0) { - values_.resize(names.size()); - for (int i = 0; i < static_cast(names.size()); i++) { - places[names[i]] = i; - } - } - - void set(std::vector> *values) { - values_ = std::move(*values); - } - - void set(const std::vector &names, - const std::vector> &values) { - for (int i = 0; i < static_cast(names.size()); i++) { - auto idx = places[names[i]]; - auto value = values[i]; - values_[idx].assign(value.begin(), value.end()); - } - } - - std::vector *> get() { - auto pts = std::vector *>(); - pts.reserve(values_.size()); - - for (auto &value : values_) { - pts.push_back(&value); - } - return pts; - } - - int fetch_count() { return ++count_; } - void reset_unseen_days() { unseen_days_ = 0; } - - void set_entry(bool is_entry) { is_entry_ = is_entry; } - - bool get_entry() { return is_entry_; } - - std::vector *> get(const std::vector names) { - auto pts = std::vector *>(); - pts.reserve(values_.size()); - - for (int i = 0; i < static_cast(names.size()); i++) { - pts.push_back(&(values_[places[names[i]]])); - } - return pts; - } - - std::vector names_; - int count_; - bool seen_after_last_save_; - int unseen_days_; - bool is_entry_; - std::vector> values_; - std::unordered_map places; -}; - -class ValueBlock { - public: - explicit ValueBlock(const std::vector value_names, - const std::vector value_dims, const Mode &mode, - const std::vector &init_attrs, - const std::string &entry_attr) - : value_names_(value_names), value_dims_(value_dims), mode_(mode) { - // for Initializer - for (size_t i = 0; i < value_names.size(); i++) { - auto name = value_names[i]; - auto slices = string::split_string(init_attrs[i], "&"); - - if (slices[0] == "gaussian_random") { - initializers_[name] = new GaussianInitializer(slices); - } else if (slices[0] == "fill_constant") { - initializers_[name] = new FillConstantInitializer(slices); - } else if (slices[0] == "uniform_random") { - initializers_[name] = new UniformInitializer(slices); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("%s can not be supported", name)); - } - } - - // for Entry - { - if (entry_attr == "none") { - entry_func_ = - std::bind(entry, std::placeholders::_1, "none"); - } else { - auto slices = string::split_string(entry_attr, "&"); - if (slices[0] == "count_filter") { - int threshold = std::stoi(slices[1]); - entry_func_ = std::bind(entry, std::placeholders::_1, threshold); - } else if (slices[0] == "probability") { - float threshold = std::stof(slices[1]); - entry_func_ = - std::bind(entry, std::placeholders::_1, threshold); - } - } - } - - rwlock_.reset(new framework::RWLock); - } - - ~ValueBlock() { - // for (auto init : initializers_) { - // delete init.second; - // initializers_.erase(init.first); - // } - // - // 
for (auto value : values_) { - // delete value.second; - // values_.erase(value.first); - // } - } - - void Init(const int64_t &id, std::vector> *values, - int count) { - if (Has(id)) { - PADDLE_THROW(platform::errors::AlreadyExists("id already exist, error")); - } - - if (values->size() != value_names_.size()) { - PADDLE_THROW( - platform::errors::AlreadyExists("values can not match, error")); - } - - auto value = new VALUE(value_names_); - value->set(values); - value->seen_after_last_save_ = true; - value->count_ = count; - values_[id] = value; - } - - std::vector *> Get( - const int64_t &id, const std::vector &value_names) { - rwlock_->RDLock(); - auto ret_values = values_.at(id)->get(value_names); - rwlock_->UNLock(); - return ret_values; - } - - void InitFromInitializer(const int64_t &id, - const std::vector &value_names) { - rwlock_->WRLock(); - - if (Has(id)) { - Update(id); - rwlock_->UNLock(); - return; - } - - auto rets = std::vector>(); - rets.resize(value_names_.size()); - - for (int i = 0; i < static_cast(value_names_.size()); i++) { - auto name = value_names_[i]; - auto *init = initializers_.at(name); - - auto dim = value_dims_[i]; - rets[i].resize(dim); - - for (int j = 0; j < static_cast(dim); j++) { - rets[i][j] = init->GetValue(); - } - } - - Init(id, &rets, 0); - Update(id); - rwlock_->UNLock(); - } - - bool GetEntry(const int64_t &id) { - rwlock_->RDLock(); - auto value = values_.at(id); - auto entry = value->get_entry(); - rwlock_->UNLock(); - return entry; - } - - void Set(const int64_t &id, const std::vector &value_names, - const std::vector> &values) { - rwlock_->WRLock(); - auto value = values_.at(id); - value->set(value_names, values); - rwlock_->UNLock(); - } - - void Update(const int64_t id) { - auto *value = values_.at(id); - value->reset_unseen_days(); - auto count = value->fetch_count(); - - if (!value->get_entry()) { - value->set_entry(entry_func_(count)); - } - } - - private: - bool Has(const int64_t id) { - auto got = values_.find(id); - if (got == values_.end()) { - return false; - } else { - return true; - } - } - - public: - std::unordered_map values_; - - private: - std::vector value_names_; - std::vector value_dims_; - Mode mode_; - std::function entry_func_; - std::unordered_map initializers_; - std::unique_ptr rwlock_{nullptr}; -}; - -class SparseVariable { - public: - explicit SparseVariable(const SparseMeta &meta) { - meta_.name = meta.name; - meta_.mode = meta.mode; - meta_.value_names = meta.value_names; - meta_.value_dims = meta.value_dims; - meta_.grad_name = meta.grad_name; - meta_.cached_varnames = meta.cached_varnames; - meta_.initializer_attrs = meta.initializer_attrs; - meta_.entry = meta.entry; - - for (int i = 0; i < static_cast(meta_.value_names.size()); i++) { - values_dims_[meta_.value_names[i]] = meta_.value_dims[i]; - } - - for (size_t i = 0; i < shard_num_; i++) { - auto block = std::make_shared( - meta.value_names, meta.value_dims, meta.mode, meta.initializer_attrs, - meta.entry); - shard_blocks_.emplace_back(block); - } - - rwlock_.reset(new framework::RWLock); - } - - void Init(const std::vector &ids) { - rwlock_->RDLock(); - for (auto &id : ids) { - auto *block = GetShard(id); - block->InitFromInitializer(id, meta_.value_names); - } - rwlock_->UNLock(); - } - - void Get(const std::vector &ids, - const std::vector &value_names, - std::vector *>> *values) { - values->resize(ids.size()); - - auto buckets = bucket(ids.size(), 8); - std::vector> fs; - - for (int j = 0; j < 8; ++j) { - auto begin = buckets[j]; - auto end = buckets[j 
+ 1]; - - fs.push_back( - framework::Async([begin, end, &values, &ids, &value_names, this]() { - for (int x = begin; x < end; x++) { - auto id = ids[x]; - auto *block = GetShard(id); - auto id_values = block->Get(id, value_names); - (*values)[x] = id_values; - } - })); - } - - for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); - } - - void GetEntry(const std::vector &ids, std::vector *values) { - auto buckets = bucket(ids.size(), 8); - std::vector> fs; - - for (int j = 0; j < 8; ++j) { - auto begin = buckets[j]; - auto end = buckets[j + 1]; - - fs.push_back(framework::Async([begin, end, &values, &ids, this]() { - for (int x = begin; x < end; x++) { - auto id = ids[x]; - auto *block = GetShard(id); - auto is_entry = block->GetEntry(id); - - if (!is_entry) { - values->push_back(id); - } - } - })); - } - for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); - } - - void Set(const std::vector &ids, - const std::vector &value_names, - const std::vector>> &values) { - for (int i = 0; i < static_cast(ids.size()); i++) { - GetShard(ids[i])->Set(ids[i], value_names, values[i]); - } - } - - void Dims(std::vector value_names, std::vector *dims) { - for (auto &name : value_names) { - dims->push_back(values_dims_.at(name)); - } - } - - std::vector CachedVarnames() const { - return meta_.cached_varnames; - } - - void Load(const std::string &dirname) { - rwlock_->WRLock(); - VLOG(1) << "load " << meta_.name << " from dir: " << dirname << " begin"; - - std::vector filenames; - for (auto &value_name : meta_.value_names) { - auto filename = string::Sprintf("%s/%s", dirname, value_name); - filenames.push_back(filename); - } - - LoadFromSelectedRows(filenames, meta_.value_names); - VLOG(1) << "load " << meta_.name << " in dir: " << dirname << " done"; - rwlock_->UNLock(); - } - - void LoadFromSelectedRows(const std::vector &filenames, - const std::vector &valuenames) { - std::vector> variables; - auto place = platform::CPUPlace(); - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto var = std::make_shared(); - variables.push_back(var); - auto &filename = filenames[i]; - std::ifstream fin(filename, std::ios::binary); - auto *selectedRows = var->GetMutable(); - - platform::DeviceContextPool &pool = - platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - framework::DeserializeFromStream(fin, selectedRows, dev_ctx); - selectedRows->SyncIndex(); - } - - std::vector tensors; - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto &slr = variables[i]->Get(); - auto src_t = slr.value(); - const auto *value = src_t.data(); - tensors.push_back(value); - } - - for (int i = 1; i < static_cast(filenames.size()); i++) { - auto rows_0 = variables[0]->Get().rows(); - auto rows_i = variables[i]->Get().rows(); - - bool is_equal = std::equal(rows_0.begin(), rows_0.end(), rows_i.begin()); - - if (!is_equal) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s and %s are not equal, can not be load rightly", filenames[0], - filenames[i])); - } - } - - auto rows = variables[0]->Get().rows(); - - for (auto i = 0; i < static_cast(rows.size()); i++) { - auto id = rows[i]; - std::vector> values; - values.resize(filenames.size()); - - for (int j = 0; j < static_cast(filenames.size()); ++j) { - values[j].resize(meta_.value_dims[j]); - std::memcpy(values[j].data(), tensors[j] + i * meta_.value_dims[j], - sizeof(float) * meta_.value_dims[j]); - } - - auto *block = GetShard(id); - block->Init(id, &values, 0); - block->Update(id); - } - } - - void Save(const std::string &dirname, 
const int mode = 0) { - rwlock_->WRLock(); - VLOG(3) << "save " << meta_.name << " in dir: " << dirname << " begin"; - - MkDirRecursively(dirname.c_str()); - - std::vector filenames; - for (auto &value_name : meta_.value_names) { - auto filename = string::Sprintf("%s/%s", dirname, value_name); - filenames.push_back(filename); - } - - SaveToSelectedRows(filenames, meta_.value_names, mode); - VLOG(3) << "save " << meta_.name << " in dir: " << dirname << " done"; - rwlock_->UNLock(); - } - - void SaveToSelectedRows(const std::vector &filenames, - const std::vector &valuenames, - const int mode) { - for (auto &value_name : valuenames) { - auto it = std::find(meta_.value_names.begin(), meta_.value_names.end(), - value_name); - if (it == meta_.value_names.end()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "[%s] is invalid param for [%s]", value_name, meta_.name)); - } - } - - auto place = platform::CPUPlace(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - std::vector ids; - - for (auto &block : shard_blocks_) { - for (auto value : block->values_) { - if (mode == 0) { - ids.push_back(value.first); - } else { - bool id_need_save = false; - // save all params - if (mode == 1) { - id_need_save = true; - } else { - id_need_save = value.second->seen_after_last_save_; - } - - if (id_need_save) { - ids.push_back(value.first); - } - value.second->seen_after_last_save_ = false; - } - } - } - - VLOG(3) << "save " << ids.size() << " feasigns for " << meta_.name - << " with mode: " << mode; - - std::vector> variables; - std::vector tensors; - std::vector dims; - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto dim = values_dims_.at(valuenames[i]); - auto var = std::make_shared(); - auto *slr = var->GetMutable(); - auto *src_t = slr->mutable_value(); - - src_t->Resize({static_cast(ids.size()), dim}); - auto *value = src_t->mutable_data(place); - - dims.push_back(dim); - variables.push_back(var); - tensors.push_back(value); - } - - std::vector *>> values; - Get(ids, valuenames, &values); - - int64_t offset = 0; - for (auto &vss : values) { - for (int i = 0; i < static_cast(vss.size()); i++) { - auto &vs = vss[i]; - std::memcpy(tensors[i] + offset * dims[i], vs->data(), - sizeof(float) * dims[i]); - } - offset += 1; - } - - for (auto &var : variables) { - auto *slr = var->GetMutable(); - slr->set_rows(ids); - slr->set_height(ids.size()); - } - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto &filename = filenames[i]; - auto &selectedRows = variables[i]->Get(); - - std::ofstream fout(filename, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fout), true, - platform::errors::Unavailable( - "Cannot open %s to save variables.", filename)); - - framework::SerializeToStream(fout, selectedRows, dev_ctx); - fout.close(); - } - } - - void SaveToText(const std::vector &filenames, - const std::vector &valuenames) { - for (auto &value_name : valuenames) { - auto it = std::find(meta_.value_names.begin(), meta_.value_names.end(), - value_name); - if (it == meta_.value_names.end()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "[%s] is invalid param for [%s]", value_name, meta_.name)); - } - } - - std::vector> fouts; - - for (auto filename : filenames) { - std::unique_ptr fout(new std::ofstream(filename)); - fouts.push_back(std::move(fout)); - } - - for (auto &block : shard_blocks_) { - for (auto value : block->values_) { - std::vector *> vss = value.second->get(valuenames); - - auto id = 
value.first; - - for (int i = 0; i < static_cast(vss.size()); i++) { - auto &vs = vss[i]; - std::stringstream ss; - ss << id << "\t"; - ss << vs->size() << "\t"; - for (auto v : (*vs)) { - ss << v << " "; - } - ss << "\n"; - - fouts[i]->write(ss.str().c_str(), sizeof(char) * ss.str().size()); - } - } - } - - for (int i = 0; i < static_cast(fouts.size()); i++) { - fouts[i]->close(); - } - } - - int64_t Size() { - int64_t cnt = 0; - - for (auto &block : shard_blocks_) { - cnt += block->values_.size(); - } - return cnt; - } - - ValueBlock *GetShard(const int64_t id) { - return shard_blocks_[id & shard_mask_].get(); - } - - SparseMeta *GetMeta() { return &meta_; } - - private: - std::unique_ptr rwlock_{nullptr}; - - SparseMeta meta_; - std::unordered_map values_dims_; - const size_t shard_mask_ = 127; - const size_t shard_num_ = 128; - std::vector> shard_blocks_; -}; - -class LargeScaleKV { - public: - LargeScaleKV() {} - - explicit LargeScaleKV(const std::vector &table_metas) { - for (auto &sparse_meta : table_metas) { - auto table_name = sparse_meta.name; - auto meta = std::shared_ptr( - new SparseVariable(std::move(sparse_meta))); - sparse_variables[table_name] = meta; - grad_to_variables[sparse_meta.grad_name] = table_name; - grad_names_.push_back(sparse_meta.grad_name); - } - } - - ~LargeScaleKV() {} - - static std::shared_ptr GetInstantcePtr() { return scale_kv_; } - - static LargeScaleKV *GetInstance() { return scale_kv_.get(); } - - static LargeScaleKV *InitInstance( - const std::vector &table_metas) { - std::call_once(init_flag_, &LargeScaleKV::Init, table_metas); - return scale_kv_.get(); - } - - static void Init(const std::vector &table_metas) { - if (scale_kv_.get() == nullptr) { - scale_kv_.reset(new LargeScaleKV(table_metas)); - } - } - - SparseVariable *Get(const std::string &name) { - auto variable = sparse_variables.at(name); - return variable.get(); - } - - bool ParamInLargeScale(const std::string &name) { - auto got = sparse_variables.find(name); - - if (got == sparse_variables.end()) { - return false; - } - - return true; - } - - bool GradInLargeScale(const std::string &name) { - auto got = grad_to_variables.find(name); - - if (got == grad_to_variables.end()) { - return false; - } - - return true; - } - - SparseVariable *GetByGrad(const std::string &name) { - return Get(grad_to_variables[name]); - } - - const std::vector &GetAllGrads() { return grad_names_; } - - private: - std::unordered_map> - sparse_variables; - std::unordered_map grad_to_variables; - std::vector grad_names_; - static std::shared_ptr scale_kv_; - static std::once_flag init_flag_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc deleted file mode 100644 index 558d70e5c3353..0000000000000 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ /dev/null @@ -1,311 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
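SparseVariable::GetShard above selects a shard with `id & shard_mask_`, which is equivalent to `id % shard_num_` only because shard_num_ (128) is a power of two and shard_mask_ is 127. The short standalone sketch below, added for illustration and not part of the diff, checks that identity and shows how sequential ids spread across shards.

```cpp
// Illustrative sketch: power-of-two sharding via a bit mask.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int64_t kShardNum = 128;             // shard_num_ in the deleted header
  const int64_t kShardMask = kShardNum - 1;  // shard_mask_ = 127
  std::vector<int64_t> counts(kShardNum, 0);

  for (int64_t id = 0; id < 100000; ++id) {
    int64_t shard = id & kShardMask;
    assert(shard == id % kShardNum);  // holds because 128 is a power of two
    ++counts[shard];
  }
  // Sequential ids spread evenly; real feasign ids only approximate this.
  std::cout << "shard 0 holds " << counts[0] << " ids\n";
  return 0;
}
```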
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/parameter_prefetch.h" -#include -#include -#include -#include -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/distributed.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -static void SplitIdsIntoMultipleVarsBySection( - const std::vector &in_ids, - const std::vector &in_varnames, const int tables, - const int pservers, const bool is_distibuted, framework::Scope *scope, - std::vector> *splited_ids, - std::vector> *origin_ids) { - PADDLE_ENFORCE_EQ( - in_varnames.size(), tables, - platform::errors::OutOfRange( - "send varnames size: %d not equal table number: %d, internal error", - in_varnames.size(), tables)); - - PADDLE_ENFORCE_LE( - tables, pservers, - platform::errors::OutOfRange("table number %d not equal or less than " - "pserver number: %d, internal error", - tables, pservers)); - - auto place = platform::CPUPlace(); - - std::set st(in_ids.begin(), in_ids.end()); - std::vector all_ids; - all_ids.assign(st.begin(), st.end()); - - splited_ids->resize(tables); - origin_ids->resize(tables); - - if (is_distibuted) { - for (auto &id : all_ids) { - auto pserver_id = id % pservers; - (*splited_ids)[pserver_id].push_back(id); - (*origin_ids)[pserver_id].push_back(id); - } - } else { - for (auto &id : all_ids) { - auto pserver_id = id % pservers; - (*origin_ids)[pserver_id].push_back(id); - id = id / pservers; - (*splited_ids)[pserver_id].push_back(id); - } - } - - for (size_t i = 0; i < in_varnames.size(); ++i) { - auto *id_tensor = - scope->Var(in_varnames[i])->GetMutable(); - - auto &ids = (*splited_ids)[i]; - if (!ids.empty()) { - auto *id_tensor_data = id_tensor->mutable_data( - framework::make_ddim({static_cast(ids.size()), 1}), place); - memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); - } - } -} - -typedef std::vector> TableAndEndpoints; - -void prefetch_core( - const std::vector &ids, const TableAndEndpoints &tables, - const framework::ExecutionContext &context, const framework::Scope &scope, - const bool is_distributed, - std::unordered_map> *recved_vec_map) { - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance( - context.Attr("trainer_id")); - - int pservers = context.Attr("pserver_num"); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &actual_ctx = *pool.Get(platform::CPUPlace()); - - std::unique_ptr local_scope = scope.NewTmpScope(); - - std::vector in_var_names; - std::vector out_var_names; - for (size_t i = 0; i < tables.size(); ++i) { - in_var_names.push_back("prefetch_send@" + tables[i].second); - out_var_names.push_back("prefetch_recv@" + tables[i].second); - } - - std::vector> split_ids; - std::vector> origin_ids; - SplitIdsIntoMultipleVarsBySection(ids, in_var_names, tables.size(), pservers, - is_distributed, local_scope.get(), - &split_ids, &origin_ids); - - // create output var in local scope - for (auto &name : out_var_names) { - local_scope->Var(name)->GetMutable(); - } - - std::vector rets; - for (size_t i 
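SplitIdsIntoMultipleVarsBySection above first deduplicates the lookup ids and then routes each id to section `id % pservers`; for a non-distributed table the row index actually sent to that pserver is `id / pservers`, while a distributed table keeps the raw id. Here is a minimal standalone sketch of that routing, added for illustration only; it is not part of the diff and its function name and return shape are hypothetical.

```cpp
// Illustrative sketch: splitting unique ids across parameter-server sections.
#include <iostream>
#include <set>
#include <utility>
#include <vector>

// Returns, per pserver section, pairs of (id sent over the wire, original id).
std::vector<std::vector<std::pair<int64_t, int64_t>>> SplitIds(
    const std::vector<int64_t>& ids, int pservers, bool is_distributed) {
  std::set<int64_t> unique(ids.begin(), ids.end());  // dedup, like the original
  std::vector<std::vector<std::pair<int64_t, int64_t>>> sections(pservers);
  for (int64_t id : unique) {
    int64_t section = id % pservers;
    int64_t sent = is_distributed ? id : id / pservers;
    sections[section].emplace_back(sent, id);
  }
  return sections;
}

int main() {
  auto sections =
      SplitIds({7, 3, 7, 10, 4}, /*pservers=*/3, /*is_distributed=*/false);
  for (size_t s = 0; s < sections.size(); ++s) {
    for (const auto& p : sections[s]) {
      std::cout << "pserver " << s << ": sends row " << p.first
                << " for original id " << p.second << "\n";
    }
  }
  // e.g. id 3 -> pserver 0 row 1; id 7 -> pserver 1 row 2; id 10 -> pserver 1 row 3.
  return 0;
}
```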
= 0; i < in_var_names.size(); i++) { - if (NeedSend(*local_scope.get(), in_var_names[i])) { - VLOG(3) << "sending " << in_var_names[i] << " to " << tables[i].second - << " to get " << out_var_names[i] << " back"; - rets.push_back(rpc_client->AsyncPrefetchVar( - tables[i].second, actual_ctx, *local_scope.get(), in_var_names[i], - out_var_names[i], tables[i].first)); - } else { - VLOG(3) << "don't send no-initialied variable: " << out_var_names[i]; - } - } - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - - for (size_t o_idx = 0; o_idx < out_var_names.size(); ++o_idx) { - auto &ids_in_this_section = origin_ids[o_idx]; - - if (!ids_in_this_section.empty()) { - auto &prefetch_out_var = - local_scope->Var(out_var_names[o_idx])->Get(); - const auto *out_var_data = prefetch_out_var.data(); - auto &dims = prefetch_out_var.dims(); - - PADDLE_ENFORCE_EQ(dims.size(), 2, - platform::errors::InvalidArgument( - "The size of Tensor dims must be 2.")); - PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0], - platform::errors::InvalidArgument( - "The size of ids in this section must equal to " - "dims[0]: %s, but got %s", - dims[0], ids_in_this_section.size())); - - auto row_numel = dims[1]; - - for (int64_t i = 0; i < dims[0]; ++i) { - auto origin_id = ids_in_this_section[i]; - std::vector vecs(row_numel); - - std::copy_n(out_var_data + i * row_numel, row_numel, vecs.begin()); - (*recved_vec_map)[origin_id] = vecs; - } - } else { - VLOG(3) << "ids in this section is empty"; - } - } -} - -void prefetch(const std::string &id_name, const std::string &out_name, - const std::string &persistable_var_name, - const bool is_distributed, - const std::vector &table_names, - const std::vector &endpoints, - const framework::ExecutionContext &context, - const framework::Scope &scope) { - prefetchs({id_name}, {out_name}, persistable_var_name, is_distributed, - table_names, endpoints, context, scope); -} - -void prefetchs(const std::vector &id_var_names, - const std::vector &out_var_names, - const std::string &persistable_var_name, - const bool is_distributed, - const std::vector &table_names, - const std::vector &endpoints, - const framework::ExecutionContext &context, - const framework::Scope &scope) { - auto vec_dim_1 = 0; - auto vec_dim_0 = 0; - framework::Variable *var = scope.FindVar(persistable_var_name); - - if (var->IsType()) { - vec_dim_1 = var->Get().value().dims()[1]; - } else { - vec_dim_0 = var->Get().dims()[0]; - vec_dim_1 = var->Get().dims()[1]; - } - - PADDLE_ENFORCE_GT(vec_dim_1, 0, - platform::errors::InvalidArgument( - "lookup table var's dim must gather than 0")); - - const auto place = - scope.FindVar(id_var_names[0])->Get().place(); - - std::vector> ids_group; - std::vector ids_union; - std::vector ids_lods; - TableAndEndpoints tables; - - for (auto &id_name : id_var_names) { - auto &id_tensor = scope.FindVar(id_name)->Get(); - std::vector ids; - TensorToVector(id_tensor, context.device_context(), &ids); - ids_union.insert(ids_union.end(), ids.begin(), ids.end()); - ids_group.push_back(ids); - ids_lods.push_back(id_tensor.lod()); - } - - std::unordered_set s(ids_union.begin(), ids_union.end()); - ids_union.assign(s.begin(), s.end()); - - for (auto &i : ids_union) { - PADDLE_ENFORCE_GE( - i, 0, platform::errors::OutOfRange( - "each element in embedding should be larger or equal 0")); - if (!is_distributed) { - PADDLE_ENFORCE_LT( - i, vec_dim_0, - platform::errors::OutOfRange( - 
"embedding id must in [0, %d) when is_distributed False", - vec_dim_0)); - } - } - - for (size_t i = 0; i < table_names.size(); i++) { - tables.push_back(std::make_pair(table_names[i], endpoints[i])); - } - std::unordered_map> recved_vec_map; - prefetch_core(ids_union, tables, context, scope, is_distributed, - &recved_vec_map); - - auto padding_idx = distributed::kNoPadding; - - if (context.HasAttr("padding_idx")) { - padding_idx = context.Attr("padding_idx"); - } - - for (size_t i = 0; i < out_var_names.size(); i++) { - std::vector ids = ids_group[i]; - auto ids_size = ids.size(); - auto *out_t = - scope.FindVar(out_var_names[i])->GetMutable(); - out_t->set_lod(ids_lods[i]); - out_t->Resize( - framework::make_ddim({static_cast(ids_size), vec_dim_1})); - auto *out_d = out_t->mutable_data(place); - - if (platform::is_cpu_place(out_t->place())) { - for (auto idx = 0; idx < static_cast(ids_size); idx++) { - const auto &id = ids[idx]; - if (padding_idx != distributed::kNoPadding && id == padding_idx) { - memset(out_d + idx * vec_dim_1, 0, sizeof(float) * vec_dim_1); - } else { - std::copy_n(recved_vec_map[id].begin(), vec_dim_1, - out_d + idx * vec_dim_1); - } - } - } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - std::vector ids_value_vec(ids_size * vec_dim_1); - for (auto idx = 0; idx < static_cast(ids_size); idx++) { - const auto &id = ids[idx]; - if (padding_idx != distributed::kNoPadding && id == padding_idx) { - memset(&ids_value_vec[idx * vec_dim_1], 0, sizeof(float) * vec_dim_1); - } else { - memcpy(&ids_value_vec[idx * vec_dim_1], &recved_vec_map[id][0], - sizeof(float) * vec_dim_1); - } - } - auto &gpu_place = BOOST_GET_CONST(platform::CUDAPlace, out_t->place()); - auto &cpu_place = BOOST_GET_CONST( - platform::CPUPlace, paddle::platform::CPUDeviceContext().GetPlace()); - auto stream = context.cuda_device_context().stream(); - memory::Copy(gpu_place, out_d, cpu_place, &ids_value_vec[0], - sizeof(float) * ids_size * vec_dim_1, stream); -#else - PADDLE_ENFORCE(true, platform::errors::PermissionDenied( - "Paddle is not compiled with GPU!")); -#endif - } - } -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h deleted file mode 100644 index 6fd3a998813c0..0000000000000 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -constexpr int64_t kNoPadding = -1; - -void prefetchs(const std::vector& id_var_names, - const std::vector& out_var_names, - const std::string& persistable_var_name, const bool backfill, - const std::vector& table_names, - const std::vector& endpoints, - const framework::ExecutionContext& context, - const framework::Scope& scope); - -void prefetch(const std::string& id_name, const std::string& out_name, - const std::string& persistable_var_name, const bool backfill, - const std::vector& table_names, - const std::vector& endpoints, - const framework::ExecutionContext& context, - const framework::Scope& scope); - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc deleted file mode 100644 index d5d3c9c3c7c48..0000000000000 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
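Reviewer note: the `prefetchs()` implementation removed above assembles its outputs on the CPU by copying each prefetched row into the output buffer and zero-filling rows whose id equals `padding_idx` (`kNoPadding` means no padding row). A simplified standalone sketch of that assembly step, assuming a plain `unordered_map` holds the received rows; all names here are illustrative.

```cpp
#include <cstdint>
#include <cstring>
#include <unordered_map>
#include <vector>

constexpr int64_t kNoPadding = -1;  // mirrors the constant in the removed header

// Fill a row-major [ids.size() x dim] buffer from prefetched rows, zeroing
// rows whose id equals padding_idx, as the removed prefetchs() does on CPU.
void FillOutput(const std::vector<int64_t>& ids,
                const std::unordered_map<int64_t, std::vector<float>>& recved,
                int64_t padding_idx, int64_t dim, float* out) {
  for (size_t i = 0; i < ids.size(); ++i) {
    float* row = out + i * dim;
    if (padding_idx != kNoPadding && ids[i] == padding_idx) {
      std::memset(row, 0, sizeof(float) * dim);  // padding id -> all zeros
    } else {
      const std::vector<float>& vec = recved.at(ids[i]);
      std::memcpy(row, vec.data(), sizeof(float) * dim);
    }
  }
}

int main() {
  std::unordered_map<int64_t, std::vector<float>> recved = {
      {7, {1.f, 2.f}}, {9, {3.f, 4.f}}};
  std::vector<float> out(3 * 2);
  FillOutput({7, 0, 9}, recved, /*padding_idx=*/0, /*dim=*/2, out.data());
  // out is now {1,2, 0,0, 3,4}
}
```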
- -#include -#include -#include - -#include "glog/logging.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/parameter_recv.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -template -void RecvSparseLodTensor(const CommContext &rpc_ctx, - const framework::Scope &scope) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto cpu_place = platform::CPUPlace(); - auto &cpu_ctx = *pool.Get(cpu_place); - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::unique_ptr local_scope = scope.NewTmpScope(); - std::vector tensors; - std::vector rets; - std::vector recv_varnames; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; - local_scope->Var(recv_var_name); - // sparse param in recv_scope is LoDTensor - rets.push_back(rpc_client->AsyncGetVarNoBarrier( - rpc_ctx.epmap[i], cpu_ctx, *local_scope.get(), recv_var_name, - recv_var_name)); - recv_varnames.push_back(recv_var_name); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - auto &recv_var_name = recv_varnames[i]; - auto *local_var = local_scope->FindVar(recv_var_name); - const auto *value = local_var->Get().data(); - tensors.push_back(value); - } - - auto *merged_var = scope.FindVar(rpc_ctx.var_name); - - if (merged_var == nullptr || !merged_var->IsInitialized()) { - PADDLE_THROW( - platform::errors::InvalidArgument("%s must initialized at first.")); - } - auto dims1 = merged_var->Get().dims()[1]; - int64_t height = 0; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto *splited_var = local_scope->FindVar(rpc_ctx.splited_varnames[i]); - height += splited_var->Get().dims()[0]; - } - - PADDLE_ENFORCE_EQ( - merged_var->Get().dims()[0], height, - platform::errors::InvalidArgument( - "Received variable must has same dimension with local variable.")); - - auto *merged_t = merged_var->GetMutable(); - auto *merged_d = merged_t->mutable_data(cpu_place); - - auto pserver_num = rpc_ctx.splited_varnames.size(); - for (int x = 0; x < height; ++x) { - auto id = x % pserver_num; - auto idx = x / pserver_num; - std::memcpy(merged_d + x * dims1, tensors[id] + idx * dims1, - sizeof(float) * dims1); - } -} - -template -void RecvGeoSparseRecords(const CommContext &rpc_ctx, - const framework::Scope &scope) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto cpu_place = platform::CPUPlace(); - auto &cpu_ctx = *pool.Get(cpu_place); - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::unique_ptr local_scope = scope.NewTmpScope(); - - std::vector rets; - for (size_t i = 0; i < 
rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - local_scope->Var(recv_var_name); - VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; - // sparse param in recv_scope is LoDTensor - rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx, - *local_scope.get(), recv_var_name, - recv_var_name, recv_var_name)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - - int64_t height = 0; - int64_t ids_num = 0; - int64_t width = 0; - - std::vector all_ids; - auto pserver_num = rpc_ctx.splited_varnames.size(); - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - auto *recv_var = local_scope->FindVar(recv_var_name); - auto &recv_t = recv_var->Get(); - - height += recv_t.height(); - ids_num += recv_t.rows().size(); - width = recv_t.value().dims()[1]; - - if (rpc_ctx.is_distributed) { - std::copy(recv_t.rows().begin(), recv_t.rows().end(), - std::back_inserter(all_ids)); - } else { - std::transform(recv_t.rows().begin(), recv_t.rows().end(), - std::back_inserter(all_ids), - [&](int64_t id) { return id * pserver_num + i; }); - } - } - - auto *var = scope.FindVar(rpc_ctx.var_name); - auto *t_ = var->GetMutable(); - T *out_data = - t_->mutable_value()->mutable_data({ids_num, width}, cpu_place); - t_->set_height(height); - t_->set_rows(all_ids); - - int64_t cnt = 0; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - auto *recv_var = local_scope->FindVar(recv_var_name); - auto &recv_t = recv_var->Get(); - - auto rows = recv_t.rows().size(); - const T *in_data = recv_t.value().data(); - std::copy_n(in_data, rows * width, out_data + cnt); - cnt += rows * width; - } - t_->SyncIndex(); -} - -template -void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) { - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::vector rets; - - // variable do not spilt - if (rpc_ctx.origin_varnames.size() == 1 && - rpc_ctx.splited_varnames.size() == 1) { - auto varname = rpc_ctx.origin_varnames[0]; - const auto place = - scope.FindVar(varname)->Get().place(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &ctx = *pool.Get(place); - VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0] << " in gpu? 
" - << platform::is_gpu_place(place); - rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], ctx, - scope, varname, varname)); - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE( - rets[i]->Wait(), 0U, - platform::errors::ExecutionTimeout("internal error in RPCClient")); - } - - VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; - return; - } else { - PADDLE_ENFORCE(false, platform::errors::Unimplemented( - "ParameterRecv can not recv dense with multi " - "parts now, add it soon.")); - } -} - -template -void ParameterRecv::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope, - bool geo_records) { - VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name; - - PADDLE_ENFORCE_GE(rpc_ctx.origin_varnames.size(), 1, - platform::errors::InvalidArgument( - "origin_varnames.size() >= 1 is permitted")); - - if (rpc_ctx.is_sparse) { - if (geo_records) { - RecvGeoSparseRecords(rpc_ctx, scope); - } else { - RecvSparseLodTensor(rpc_ctx, scope); - } - } else { - RecvLodTensor(rpc_ctx, scope); - } - - VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; -} -template -void ParameterRecv::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope) { - this->operator()(rpc_ctx, scope, false); -} - -template struct ParameterRecv; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.h b/paddle/fluid/operators/distributed/parameter_recv.h deleted file mode 100644 index c30d21aa791e2..0000000000000 --- a/paddle/fluid/operators/distributed/parameter_recv.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" - -namespace paddle { -namespace operators { -namespace distributed { - -template -struct ParameterRecv { - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope, - bool barrier); - - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope); -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc deleted file mode 100644 index 109514ca2541c..0000000000000 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ /dev/null @@ -1,331 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/parameter_send.h" -#include -#include -#include "glog/logging.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace framework { -class Scope; -class Tensor; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -typedef std::vector> EP_SPLIT_TABLE_PAIRS; - -inline EP_SPLIT_TABLE_PAIRS GetMultiFieldCommContext( - const CommContext &rpc_ctx, const framework::Scope &scope, - int multi_parts) { - EP_SPLIT_TABLE_PAIRS table_pairs; - - auto *send_var = scope.FindVar(rpc_ctx.var_name); - if (send_var->IsType()) { - PADDLE_ENFORCE_GE(multi_parts, 1, - platform::errors::InvalidArgument( - "multi_parts must == 1 in parameter send, now is: %d", - multi_parts)); - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - table_pairs.push_back( - std::make_pair(rpc_ctx.epmap[i], rpc_ctx.splited_varnames[i])); - } - - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "GetMultiFieldCommContext unsupported LoDTensor current!")); - } - - return table_pairs; -} // namespace distributed - -void SendByNotifyRPC(const CommContext &rpc_ctx, - const framework::Scope &scope) { - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto &send_var_name = rpc_ctx.var_name; - std::vector rets; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - if (NeedSend(scope, send_var_name)) { - for (size_t j = 0; j < rpc_ctx.epmap.size(); j++) { - auto &endpoint = rpc_ctx.epmap[j]; - VLOG(4) << "sending " << send_var_name << " to " << endpoint; - rets.push_back(rpc_client->AsyncDistributeNotify(endpoint, cpu_ctx, scope, - send_var_name)); - VLOG(4) << "send var " << send_var_name << " by notify RPC done"; - } - } else { - VLOG(3) << "don't send non-initialized variable: " << rpc_ctx.var_name; - } - - for (auto &handle : rets) { - PADDLE_ENFORCE_NE(handle->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } -} - -template -void ParameterSend::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope, bool sync, - int multi_parts) { - if (rpc_ctx.var_name == STEP_COUNTER) { - SendByNotifyRPC(rpc_ctx, scope); - return; - } - - std::unique_ptr local_scope = scope.NewTmpScope(); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx = *pool.Get(platform::CPUPlace()); - - distributed::RPCClient *rpc_client = - 
distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::vector rets; - auto *send_var = scope.FindVar(rpc_ctx.var_name); - - if (send_var->IsType()) { - size_t out_num = rpc_ctx.splited_varnames.size(); - if (out_num > 1) { - auto &send_tensor = send_var->Get(); - auto &send_tensor_dims = send_tensor.dims(); - std::vector outs_dims; - outs_dims.reserve(out_num); - - // infer output shape - PADDLE_ENFORCE_EQ( - rpc_ctx.height_sections.size(), out_num, - platform::errors::InvalidArgument("tensor split sections size" - "should be equal to output size.")); - for (size_t i = 0; i < out_num; ++i) { - auto dim = send_tensor_dims; - dim[0] = rpc_ctx.height_sections[i]; - outs_dims.push_back(dim); - } - - // create output var in local scope - size_t row_offset = 0; - for (size_t i = 0; i < out_num; ++i) { - framework::Tensor *out = local_scope->Var(rpc_ctx.splited_varnames[i]) - ->GetMutable(); - *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); - row_offset += outs_dims[i][0]; - } - } else { - auto &send_tensor = send_var->Get(); - framework::Tensor *out = local_scope->Var(rpc_ctx.splited_varnames[0]) - ->GetMutable(); - out->ShareDataWith(send_tensor); - } - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &send_var_name = rpc_ctx.splited_varnames[i]; - auto &endpoint = rpc_ctx.epmap[i]; - VLOG(4) << " send var name: " << send_var_name - << "endpoint: " << endpoint; - if (NeedSend(*local_scope.get(), send_var_name)) { - VLOG(3) << "sending " << send_var_name << " to " << endpoint; - rets.push_back(rpc_client->AsyncSendVar( - endpoint, cpu_ctx, *local_scope.get(), send_var_name)); - VLOG(4) << "send var " << send_var_name << " async handle done"; - } else { - VLOG(3) << "don't send non-initialized variable: " - << rpc_ctx.splited_varnames[i]; - } - } - } else if (send_var->IsType()) { - auto &send_slr = send_var->Get(); - - auto &send_rows = send_slr.rows(); - if (send_rows.size() == 0) { - LOG(WARNING) - << "WARNING: The variable sent to pserver is empty, which " - "may cause an unknown error. 
Please check the state of " - "use_double_buffer in pyreader/dataloader async mode, you need to " - "turn it false."; - } - - std::vector> outs_rows_idx; - std::vector> outs_dense_idx; - - auto table_pairs = GetMultiFieldCommContext(rpc_ctx, scope, 1); - outs_rows_idx.resize(table_pairs.size()); - outs_dense_idx.resize(table_pairs.size()); - - auto row_numel = send_slr.value().numel() / send_slr.value().dims()[0]; - auto *src = send_slr.value().data(); - - // create output var in local scope - std::vector outs; - for (auto &table : table_pairs) { - auto *out = - local_scope->Var(table.second)->GetMutable(); - outs.push_back(out); - } - - if (!rpc_ctx.is_distributed) { - auto pserver_num = rpc_ctx.epmap.size(); - - // split rows index into output sparse vars - for (size_t i = 0; i < send_rows.size(); ++i) { - auto ep_idx = send_rows[i] % pserver_num; - auto id = send_rows[i] / pserver_num; - outs_rows_idx[ep_idx].push_back(id); - outs_dense_idx[ep_idx].push_back(i); - } - - auto place = platform::CPUPlace(); - - for (size_t out_idx = 0; out_idx < rpc_ctx.splited_varnames.size(); - out_idx++) { - auto rows_idx = outs_rows_idx[out_idx]; - - auto dims = send_slr.GetCompleteDims(); - dims[0] = rows_idx.size(); - outs[out_idx]->set_height(rpc_ctx.height_sections[out_idx]); - outs[out_idx]->mutable_rows()->clear(); - outs[out_idx]->mutable_value()->mutable_data(dims, send_slr.place()); - - if (rows_idx.size() > 0) { - for (auto idx : rows_idx) { - outs[out_idx]->mutable_rows()->push_back(idx); - } - auto dst = outs[out_idx]->mutable_value()->mutable_data(place); - for (size_t j = 0; j < rows_idx.size(); j++) { - if (platform::is_cpu_place(place)) { - memory::Copy(platform::CPUPlace(), dst + j * row_numel, - platform::CPUPlace(), - src + outs_dense_idx[out_idx][j] * row_numel, - sizeof(T) * row_numel); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("do not support GPU now")); - } - } - } - PADDLE_ENFORCE_EQ( - rows_idx.size(), outs[out_idx]->rows().size(), - platform::errors::InvalidArgument( - "rows should has the same size with tensor dim 0")); - } - } else { - auto pserver_num = rpc_ctx.epmap.size(); - - // split rows index into output sparse vars - for (size_t i = 0; i < send_rows.size(); ++i) { - auto out_idx = send_rows[i] % pserver_num; - outs_rows_idx[out_idx].push_back(send_rows[i]); - outs_dense_idx[out_idx].push_back(i); - } - - auto place = platform::CPUPlace(); - - for (size_t out_idx = 0; out_idx < rpc_ctx.splited_varnames.size(); - out_idx++) { - auto rows_idx = outs_rows_idx[out_idx]; - - auto dims = send_slr.GetCompleteDims(); - dims[0] = rows_idx.size(); - - outs[out_idx]->set_height(rpc_ctx.height_sections[out_idx]); - outs[out_idx]->mutable_rows()->clear(); - outs[out_idx]->mutable_value()->mutable_data(dims, send_slr.place()); - - if (rows_idx.size() > 0) { - for (auto idx : rows_idx) { - outs[out_idx]->mutable_rows()->push_back(idx); - } - auto dst = outs[out_idx]->mutable_value()->mutable_data(place); - for (size_t j = 0; j < rows_idx.size(); j++) { - if (platform::is_cpu_place(place)) { - memory::Copy(platform::CPUPlace(), dst + j * row_numel, - platform::CPUPlace(), - src + outs_dense_idx[out_idx][j] * row_numel, - sizeof(T) * row_numel); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("do not support GPU now")); - } - } - } - PADDLE_ENFORCE_EQ( - rows_idx.size(), outs[out_idx]->rows().size(), - platform::errors::InvalidArgument( - "rows should has the same size with tensor dim 0")); - } - } - - for (size_t i = 0; i < table_pairs.size(); i++) { - 
auto &send_var_name = table_pairs[i].second; - auto &endpoint = table_pairs[i].first; - auto need_send = NeedSend(*local_scope.get(), send_var_name); - - VLOG(4) << "send var name: " << send_var_name - << " send var endpoint: " << endpoint - << " need send: " << need_send; - - if (need_send) { - VLOG(4) << "sending " << send_var_name << " to " << endpoint; - - rets.push_back(rpc_client->AsyncSendVar( - endpoint, cpu_ctx, *local_scope.get(), send_var_name)); - VLOG(4) << "send var " << send_var_name << " async handle done"; - } else { - VLOG(4) << "don't send non-initialized variable: " - << rpc_ctx.splited_varnames[i]; - } - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "unsupported var type: %s to send!", send_var->Type())); - } - - VLOG(4) << "Prepare to send var " << rpc_ctx.var_name; - if (sync) { - for (auto &handle : rets) { - VLOG(4) << "Wait send var to pserver handle: " << handle; - PADDLE_ENFORCE_NE(handle->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - } -} - -template struct ParameterSend; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h deleted file mode 100644 index cedc98b1fcadd..0000000000000 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// NOTE: This file was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// file and did some modifications so that we can send gRPC -// requests without too much copying of the tensor data. 
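Reviewer note: the header removed here hand-rolls the protobuf wire format so tensors can be serialized for gRPC without extra copies, and its core is base-128 varint encoding. As a standalone illustration of the same scheme (equivalent in spirit to the `EncodeVarint64` helper in the deleted file; the driver code is mine):

```cpp
#include <cstdint>
#include <cstdio>

// Protobuf-style base-128 varint: 7 payload bits per byte, MSB set on every
// byte except the last. Returns the number of bytes written (at most 10).
int EncodeVarint64(uint64_t v, unsigned char* dst) {
  int n = 0;
  while (v >= 0x80) {
    dst[n++] = static_cast<unsigned char>((v & 0x7F) | 0x80);
    v >>= 7;
  }
  dst[n++] = static_cast<unsigned char>(v);
  return n;
}

int main() {
  unsigned char buf[10];
  int n = EncodeVarint64(300, buf);  // expected bytes: AC 02
  for (int i = 0; i < n; ++i) std::printf("%02X ", buf[i]);
  std::printf("\n");
}
```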
- -#pragma once - -#include - -#include "grpc++/grpc++.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -char* EncodeVarint32(char* dst, uint32_t v) { - // Operate on characters as unsigneds - unsigned char* ptr = reinterpret_cast(dst); - static const int B = 128; - if (v < (1 << 7)) { - *(ptr++) = v; - } else if (v < (1 << 14)) { - *(ptr++) = v | B; - *(ptr++) = v >> 7; - } else if (v < (1 << 21)) { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = v >> 14; - } else if (v < (1 << 28)) { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = (v >> 14) | B; - *(ptr++) = v >> 21; - } else { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = (v >> 14) | B; - *(ptr++) = (v >> 21) | B; - *(ptr++) = v >> 28; - } - return reinterpret_cast(ptr); -} - -char* EncodeVarint64(char* dst, uint64_t v) { - static const int B = 128; - unsigned char* ptr = reinterpret_cast(dst); - while (v >= B) { - *(ptr++) = (v & (B - 1)) | B; - v >>= 7; - } - *(ptr++) = static_cast(v); - return reinterpret_cast(ptr); -} - -int VarintLength(uint64_t v) { - int len = 1; - while (v >= 128) { - v >>= 7; - len++; - } - return len; -} - -class ProtoEncodeHelper { - public: - ProtoEncodeHelper(char* buf, int max_size) - : base_(buf), p_(buf), limit_(base_ + max_size) {} - - ~ProtoEncodeHelper() {} - - const char* data() const { return base_; } - size_t size() const { return p_ - base_; } - - void WriteUint64(int tag, uint64_t v) { - Encode32(combine(tag, WIRETYPE_VARINT)); - Encode64(v); - } - void WriteBool(int tag, bool v) { - Encode32(combine(tag, WIRETYPE_VARINT)); - EncodeBool(v); - } - void WriteString(int tag, const std::string& v) { - Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); - Encode32(v.size()); - EncodeBytes(v.data(), v.size()); - } - void WriteVarlengthBeginning(int tag, uint32_t len) { - Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); - Encode32(len); - } - void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); } - - private: - // Note: this module's behavior must match the protocol buffer wire encoding - // format. - enum { - WIRETYPE_VARINT = 0, - WIRETYPE_LENGTH_DELIMITED = 2, - }; - static uint32_t combine(uint32_t tag, uint32_t type) { - return ((tag << 3) | type); - } - inline void Encode32(uint32_t v) { - if (v < 128) { - // Fast path for single-byte values. Many of the calls will use a - // constant value for v, so the comparison will get optimized away - // when Encode32 is inlined into the caller. - *p_ = v; - p_++; - } else { - p_ = EncodeVarint32(p_, v); - } - } - void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); } - void EncodeBool(bool v) { - *p_ = (v ? 1 : 0); // Equal to varint32 encoding of 0 or 1 - p_++; - } - void EncodeBytes(const char* bytes, int N) { - memcpy(p_, bytes, N); - p_ += N; - } - - char* base_; - char* p_; - char* limit_; // Just for CHECKs -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h deleted file mode 100644 index 44359af1b1b2a..0000000000000 --- a/paddle/fluid/operators/distributed/request_handler.h +++ /dev/null @@ -1,261 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include // NOLINT - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/platform/macros.h" - -namespace paddle { -namespace operators { -namespace distributed { - -constexpr char kRequestSend[] = "RequestSend"; -constexpr char kRequestGet[] = "RequestGet"; -constexpr char kRequestGetMonomerVariable[] = "RequestGetMonomerVariable"; -constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier"; -constexpr char kRequestPrefetch[] = "RequestPrefetch"; -constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; -constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; -constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier"; -constexpr char kRequestNotify[] = "RequestNotify"; -constexpr char kRequestSendAndRecv[] = "RequestSendAndRecv"; - -constexpr char kSendRPC[] = "SendRPC"; -constexpr char kGetRPC[] = "GetRPC"; -constexpr char kGetNoBarrierRPC[] = "GetNoBarrierRPC"; -constexpr char kGetMonomerRPC[] = "GetMonomerRPC"; -constexpr char kPrefetchRPC[] = "PrefetchRPC"; -constexpr char kBatchBarrierRPC[] = "BatchBarrierRPC"; -constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC"; -constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC"; -constexpr char kSendCompleteRPC[] = "SendCompleteRPC"; -constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC"; -constexpr char kSendAndRecvRPC[] = "SendAndRecvRPC"; -constexpr int64_t kPrefetchTimeout = 60000; - -#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" -#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" -#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" -#define COMPLETE_MESSAGE "COMPLETE@RECV" -#define WITHOUT_BARRIER_MESSAGE "@WITHOUT_BARRIER@RECV" -#define LEARNING_RATE_DECAY_COUNTER "@LR_DECAY_COUNTER@" -#define STEP_COUNTER "@PS_STEP_COUNTER@" - -#define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" -#define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" - -enum DistributedMode { kSync = 0, kAsync = 1, kHalfAsync = 2, kGeo = 3 }; - -class RPCServer; - -class VarHandle { - public: - VarHandle(const std::string ep, const std::string& method, - const std::string& name, - const platform::DeviceContext* p_ctx = nullptr, - const framework::Scope* p_scope = nullptr) - : status_(kDefaultState) { - ep_ = ep; - ctx_ = p_ctx; - scope_ = p_scope; - name_ = name; - method_ = method; - } - - virtual ~VarHandle() {} - - public: - bool should_retry = false; - - bool Wait() { - int ret = kDefaultState; - { - std::unique_lock lk(sync_mutex_); - wait_cond_.wait(lk, [this] { return status_ != kDefaultState; }); - ret = status_; - } - VLOG(7) << "VarHandle wait:" << ret; - return ret != kErrorState; - } - - void Finish(bool ok) { - { - std::unique_lock lk(sync_mutex_); - status_ = ok ? 
kFinishState : kErrorState; - } - VLOG(7) << "VarHandle finish:" << ok; - wait_cond_.notify_all(); - } - - std::string String() const { - std::ostringstream s; - s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], status:[" - << status_ << "]"; - return s.str(); - } - - std::string ep() const { return ep_; } - const platform::DeviceContext* ctx() const { return ctx_; } - const framework::Scope* scope() const { return scope_; } - std::string name() const { return name_; } - std::string method() const { return method_; } - - protected: - // RPC endpoint. - std::string ep_; - const platform::DeviceContext* ctx_; - const framework::Scope* scope_; - // Variable name. - std::string name_; - // RPC method name. - std::string method_; - - protected: - std::mutex sync_mutex_; - std::condition_variable wait_cond_; - - enum VarHandleStatus { - kDefaultState = -1, - kErrorState = 0, - kFinishState = 1, - }; - VarHandleStatus status_; - - private: - DISABLE_COPY_AND_ASSIGN(VarHandle); -}; - -typedef std::shared_ptr VarHandlePtr; - -class RequestHandler { - public: - explicit RequestHandler(int distributed_mode) - : distributed_mode_(distributed_mode), - dev_ctx_(nullptr), - executor_(nullptr), - scope_(nullptr), - program_(nullptr), - rpc_server_(nullptr) {} - - virtual ~RequestHandler() {} - - // Set attributes. - void SetScope(framework::Scope* scope) { scope_ = scope; } - void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; } - void SetProgram(framework::ProgramDesc* program) { program_ = program; } - void SetExecutor(framework::Executor* executor) { executor_ = executor; } - - // Used for dist lookup table prefetch - void SetPrefetchPreparedCtx( - std::unordered_map< - std::string, std::shared_ptr>* g) { - prefetch_var_name_to_prepared_ctx_ = g; - } - - void SetCheckpointNotifyPreparedCtx( - std::shared_ptr g) { - checkpoint_prepared_ctx_ = g; - } - - // Used for async. - void SetGradToPreparedCtx( - std::unordered_map< - std::string, std::shared_ptr>* g) { - grad_to_prepared_ctx_ = g; - } - - void SetSparseGradToParam(std::unordered_map* g) { - sparse_grad_to_param_ = g; - } - - void SetLrDecayPreparedCtx( - std::shared_ptr g) { - lr_decay_prepared_ctx_ = g; - } - - void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; } - - // Get attributes. - int distributed_mode() { return distributed_mode_; } - framework::Scope* scope() { return scope_; } - const platform::DeviceContext* dev_ctx() { return dev_ctx_; } - framework::ProgramDesc* program() { return program_; } - framework::Executor* executor() { return executor_; } - - // This function processes user's rpc request. - // The implemention is in request_handler_impl. 
- // example: - // std::string varname = request_.varname(); - // - // auto scope = request_handler_->scope(); - // auto invar = scope->FindVar(varname); - // framework::Variable* outvar = nullptr; - // - // request_handler_->Handle(varname, scope, invar, &outvar); - // if (outvar) { - // SerializeToByteBuffer(varname, outvar, - // *request_handler_->dev_ctx(), &reply_); - // } - virtual bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "", - const std::string& table_name = "") = 0; - - protected: - const int distributed_mode_; - - const platform::DeviceContext* dev_ctx_; - framework::Executor* executor_; - framework::Scope* scope_; - framework::ProgramDesc* program_; - - // used for distribute lookup table prefetch - std::unordered_map>* - prefetch_var_name_to_prepared_ctx_; - // used for checkpoint notify - std::shared_ptr checkpoint_prepared_ctx_; - - // Used for async. - std::unordered_map>* - grad_to_prepared_ctx_; - std::unordered_map* sparse_grad_to_param_; - - // used for lr decay - std::shared_ptr lr_decay_prepared_ctx_; - RPCServer* rpc_server_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc deleted file mode 100644 index 8c4f2ef57a32c..0000000000000 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ /dev/null @@ -1,354 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/string/piece.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/split.h" - -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" - -namespace paddle { -namespace operators { -namespace distributed { - -// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables -// to directory specified. 
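Reviewer note: the `VarHandle` removed above is the token an async RPC caller blocks on. `Wait()` sleeps on a condition variable until the transport thread calls `Finish(ok)`, which flips a tri-state status (pending / error / finished) and wakes all waiters. A minimal standalone sketch of that handshake, with illustrative names:

```cpp
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

// Tri-state completion token mirroring VarHandle::Wait()/Finish():
// -1 = pending (default), 0 = error, 1 = finished.
class Completion {
 public:
  bool Wait() {
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return status_ != -1; });
    return status_ == 1;
  }
  void Finish(bool ok) {
    {
      std::lock_guard<std::mutex> lk(mu_);
      status_ = ok ? 1 : 0;
    }
    cv_.notify_all();  // wake every caller blocked in Wait()
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  int status_ = -1;
};

int main() {
  Completion handle;
  std::thread rpc([&] { handle.Finish(true); });  // stands in for the RPC callback
  std::cout << (handle.Wait() ? "ok" : "error") << "\n";
  rpc.join();
}
```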
-constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; - -bool RequestSendHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestSendHandler:" << varname; - - // Sync - if (varname == BATCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE"; - rpc_server_->IncreaseBatchBarrier(kRequestSend); - } else if (varname == COMPLETE_MESSAGE) { - VLOG(3) << "sync: recv complete message"; - - if (HeartBeatMonitor::GetInstance() != nullptr) { - HeartBeatMonitor::GetInstance()->Update(trainer_id, "", COMPLETED); - } - - rpc_server_->Complete(); - } else { - // Async - if (distributed_mode_ != DistributedMode::kSync) { - VLOG(3) << "async process var: " << varname; - if (varname == BATCH_BARRIER_MESSAGE) { - PADDLE_THROW(platform::errors::InvalidArgument( - "async mode should not recv BATCH_BARRIER_MESSAGE or " - "COMPLETE_MESSAGE")); - } - HeartBeatMonitor::GetInstance()->Update(trainer_id, varname, RUNNING); - - std::string run_varname = varname; - - string::Piece part_piece("@PIECE"); - string::Piece var_name_piece = string::Piece(varname); - - if (string::Contains(var_name_piece, part_piece)) { - auto varname_splits = paddle::string::Split(varname, '@'); - PADDLE_ENFORCE_EQ( - varname_splits.size(), 3, - platform::errors::InvalidArgument( - "varname: %s should be separated into 3 parts by @", varname)); - run_varname = varname_splits[0]; - scope->Rename(varname, run_varname); - } - - auto *var = scope->FindVar(run_varname); - - // for sparse ids - if (var->IsType()) { - if (distributed_mode_ == DistributedMode::kAsync || - distributed_mode_ == DistributedMode::kHalfAsync) { - auto *ins = distributed::LargeScaleKV::GetInstance(); - if (ins->GradInLargeScale(run_varname)) { - auto *large_scale_var = ins->GetByGrad(run_varname); - - for (auto name : large_scale_var->CachedVarnames()) { - scope->Var(name); - } - } - } - if (distributed_mode_ == DistributedMode::kGeo) { - if (AsyncSparseParamUpdateRecorder::GetInstance()->HasGrad( - run_varname)) { - auto &grad_slr = - scope->FindVar(run_varname)->Get(); - AsyncSparseParamUpdateRecorder::GetInstance()->Update( - run_varname, grad_slr.rows()); - } - } - } - - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[run_varname].get(), - scope); - return true; - } else { // sync - rpc_server_->WaitCond(kRequestSend); - VLOG(3) << "sync: processing received var: " << varname; - PADDLE_ENFORCE_NOT_NULL( - invar, platform::errors::NotFound( - "sync: Can not find server side var %s.", varname)); - } - } - return true; -} - -bool RequestGetHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "RequestGetHandler:" << varname - << " out_var_name: " << out_var_name << " trainer_id: " << trainer_id - << " table_name: " << table_name; - - if (distributed_mode_ == DistributedMode::kSync) { - if (varname == FETCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv fetch barrier message"; - rpc_server_->IncreaseBatchBarrier(kRequestGet); - } else { - rpc_server_->WaitCond(kRequestGet); - *outvar = scope_->FindVar(varname); - } - } else { - if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) { - if (enable_dc_asgd_) { - // NOTE: the format is determined by 
distribute_transpiler.py - std::string param_bak_name = - string::Sprintf("%s.trainer_%d_bak", varname, trainer_id); - VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id; - auto var = scope_->FindVar(varname); - auto t_orig = var->Get(); - auto param_bak = scope_->Var(param_bak_name); - auto t = param_bak->GetMutable(); - t->mutable_data(dev_ctx_->GetPlace(), t_orig.type()); - VLOG(3) << "copying " << varname << " to " << param_bak_name; - framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); - } - - if (distributed_mode_ == DistributedMode::kGeo && - AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) && - !table_name.empty()) { - VLOG(3) << "AsyncSparseParamUpdateRecorder " << varname << " exist "; - - std::vector updated_rows; - AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear( - varname, trainer_id, &updated_rows); - - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto &row_id : updated_rows) { - sstream << row_id << ", "; - } - sstream << "]"; - VLOG(3) << "updated_rows size: " << updated_rows.size() << " " - << sstream.str(); - } - - auto &origin_tensor = - scope_->FindVar(varname)->Get(); - auto *origin_tensor_data = origin_tensor.data(); - auto &dims = origin_tensor.dims(); - *outvar = scope->Var(); - auto *out_slr = (*outvar)->GetMutable(); - out_slr->set_rows(updated_rows); - out_slr->set_height(dims[0]); - auto out_dims = framework::make_ddim( - {static_cast(updated_rows.size()), dims[1]}); - auto *data = out_slr->mutable_value()->mutable_data( - out_dims, origin_tensor.place()); - auto width = dims[1]; - for (size_t i = 0; i < updated_rows.size(); ++i) { - PADDLE_ENFORCE_LT( - updated_rows[i], dims[0], - platform::errors::OutOfRange( - "The value of updated_rows: %s out of Tensor %s dims[0]: %s", - updated_rows[i], varname, dims[0])); - memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width, - sizeof(float) * width); - } - } else { - *outvar = scope_->FindVar(varname); - } - } - } - return true; -} - -bool RequestGetNoBarrierHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestGetNoBarrierHandler:" << varname - << " out_var_name: " << out_var_name; - - // get var from pserver immediately without barriers - string::Piece without_barrier_piece(WITHOUT_BARRIER_MESSAGE); - string::Piece var_name_piece = string::Piece(varname); - - if (string::Contains(var_name_piece, without_barrier_piece)) { - var_name_piece = string::TrimSuffix(var_name_piece, without_barrier_piece); - VLOG(4) << "Get var " << var_name_piece << " with " - << WITHOUT_BARRIER_MESSAGE; - *outvar = scope_->FindVar(var_name_piece.ToString()); - return true; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "GetNoBarrier must contain %s", WITHOUT_BARRIER_MESSAGE)); - } - return true; -} - -bool RequestPrefetchHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestPrefetchHandler " << varname; - - (*outvar)->GetMutable(); - - VLOG(1) << "Prefetch " - << "tablename: " << table_name << " ids:" << varname - << " out: " << out_var_name; - paddle::platform::CPUPlace cpu_place; - auto *ins = distributed::LargeScaleKV::GetInstance(); - - if 
(ins->ParamInLargeScale(table_name)) { - auto lookup_table_op = PullLargeScaleOp(table_name, varname, out_var_name); - lookup_table_op->Run(*scope, cpu_place); - } else { - auto lookup_table_op = - BuildLookupTableOp(table_name, varname, out_var_name); - lookup_table_op->Run(*scope, cpu_place); - } - - return true; -} - -bool RequestCheckpointHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "receive save var " << varname << " with path " << out_var_name - << " mode " << table_name; - - int mode = std::stoi(table_name); - - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Save(out_var_name, mode); - return true; -} - -bool RequestNotifyHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "RequestNotifyHandler: " << varname - << ", trainer_id: " << trainer_id; - - string::Piece decay_piece(STEP_COUNTER); - string::Piece var_name_piece = string::Piece(varname); - if (string::Contains(var_name_piece, decay_piece)) { - VLOG(3) << "LearningRate Decay Counter Update"; - - auto *send_var = scope->FindVar(varname); - auto send_var_tensor = send_var->Get(); - auto *send_value = - send_var_tensor.mutable_data(send_var_tensor.place()); - - auto counter = decay_counters.at(trainer_id); - counter += send_value[0]; - decay_counters.at(trainer_id) = counter; - - auto *global_step_var = this->scope()->FindVar(LEARNING_RATE_DECAY_COUNTER); - if (global_step_var == nullptr) { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not find LEARNING_RATE_DECAY_COUNTER ")); - } - - auto *tensor = global_step_var->GetMutable(); - auto *value = tensor->mutable_data(platform::CPUPlace()); - - auto global_counter = 0; - for (auto &trainer_counter : decay_counters) { - global_counter += trainer_counter.second; - } - value[0] = global_counter; - - if (lr_decay_prepared_ctx_.get() == nullptr) { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not find decay block for executor")); - } - - executor_->RunPreparedContext(lr_decay_prepared_ctx_.get(), scope_); - } - return true; -} - -bool RequestSendAndRecvHandler::Handle(const std::string &varname, - framework::Scope *Scope, - framework::Variable *var, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "SendAndRecvHandle: " << varname - << " out_var_name: " << out_var_name - << " , trainer_id: " << trainer_id; - - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), Scope); - *outvar = Scope->FindVar(out_var_name); - return true; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h deleted file mode 100644 index 6d239673f9104..0000000000000 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestSendHandler final : public RequestHandler { - public: - explicit RequestSendHandler(int distributed_mode, bool enable_dc_asgd = false) - : RequestHandler(distributed_mode) { - enable_dc_asgd_ = enable_dc_asgd; - } - virtual ~RequestSendHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - bool enable_dc_asgd_; -}; - -class RequestGetHandler final : public RequestHandler { - public: - explicit RequestGetHandler(int distributed_mode, bool enable_dc_asgd = false) - : RequestHandler(distributed_mode) { - enable_dc_asgd_ = enable_dc_asgd; - } - virtual ~RequestGetHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - bool enable_dc_asgd_; -}; - -class RequestGetNoBarrierHandler final : public RequestHandler { - public: - RequestGetNoBarrierHandler() : RequestHandler(false) {} - virtual ~RequestGetNoBarrierHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; -}; - -static inline void BuildVar(const std::string& param_name, - std::initializer_list arguments, - paddle::framework::proto::OpDesc::Var* var) { - var->set_parameter(param_name); - for (auto& arg_name : arguments) { - *var->mutable_arguments()->Add() = arg_name; - } -} - -class RequestPrefetchHandler final : public RequestHandler { - public: - explicit RequestPrefetchHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - virtual ~RequestPrefetchHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - std::unique_ptr PullLargeScaleOp( - const std::string& table_name, const std::string& id_name, - const std::string& out_name) { - framework::OpDesc desc; - 
desc.SetType("lookup_sparse_table_read"); - desc.SetInput("Ids", {id_name}); - desc.SetOutput("Out", std::vector({out_name})); - desc.SetAttr("tablename", {table_name}); - desc.SetAttr("init", true); - desc.SetAttr("value_names", std::vector({"Param"})); - - auto op = paddle::framework::OpRegistry::CreateOp(desc); - return op; - } - - std::unique_ptr BuildLookupTableOp( - const std::string& table_name, const std::string& id_name, - const std::string& out_name) { - paddle::framework::proto::OpDesc op_desc; - op_desc.set_type("lookup_table"); - BuildVar("W", {table_name.data()}, op_desc.add_inputs()); - BuildVar("Ids", {id_name.data()}, op_desc.add_inputs()); - BuildVar("Out", {out_name.data()}, op_desc.add_outputs()); - - auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - return op; - } -}; - -class RequestCheckpointHandler final : public RequestHandler { - public: - explicit RequestCheckpointHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - - virtual ~RequestCheckpointHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - std::unique_ptr BuildCheckpointOp( - const std::string& varname, const std::string& file_path) { - paddle::framework::proto::OpDesc op_desc; - op_desc.set_type("save"); - BuildVar("X", {varname.data()}, op_desc.add_inputs()); - - auto attr = op_desc.mutable_attrs()->Add(); - attr->set_name("file_path"); - attr->set_type(paddle::framework::proto::AttrType::STRING); - attr->set_s(file_path); - - auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - return op; - } -}; - -class RequestNotifyHandler final : public RequestHandler { - public: - explicit RequestNotifyHandler(int distributed_mode, int trainers) - : RequestHandler(distributed_mode) { - this->trainers = trainers; - for (int i = 0; i < trainers; i++) { - decay_counters[i] = 0; - } - } - virtual ~RequestNotifyHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - int trainers; - std::unordered_map decay_counters; -}; - -class RequestSendAndRecvHandler final : public RequestHandler { - public: - explicit RequestSendAndRecvHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - virtual ~RequestSendAndRecvHandler() {} - bool Handle(const std::string& varname, framework::Scope* Scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.cc b/paddle/fluid/operators/distributed/rpc_client.cc deleted file mode 100644 index 57ce54870decf..0000000000000 --- a/paddle/fluid/operators/distributed/rpc_client.cc +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "gflags/gflags.h" - -// default to 3min to avoid temprary network failures. -DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc"); -DEFINE_int32(rpc_retry_times, 3, "retry times for rpc"); - -namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag RPCClient::init_flag_; -std::unique_ptr RPCClient::rpc_client_(nullptr); -int RPCClient::trainer_id_ = 0; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h deleted file mode 100644 index 2c756a6f71ff9..0000000000000 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
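Reviewer note: `rpc_client.cc`, deleted above, only holds the static state behind the `RPCClient` singleton (a `std::once_flag`, the client pointer and the trainer id); the header that follows creates the concrete client exactly once per process via `std::call_once`. A standalone sketch of that lazy-initialization pattern, using a stand-in client type rather than the real gRPC client:

```cpp
#include <iostream>
#include <memory>
#include <mutex>

// Stand-in for a concrete client implementation (the real code plugs in a
// gRPC-based subclass through the template parameter).
class FakeClient {
 public:
  explicit FakeClient(int trainer_id) : trainer_id_(trainer_id) {}
  int trainer_id() const { return trainer_id_; }

 private:
  int trainer_id_;
};

// One client per process, created lazily on first use, mirroring the
// call_once dance in RPCClient::GetInstance()/Init().
template <typename T>
T* GetClient(int trainer_id) {
  static std::once_flag flag;
  static std::unique_ptr<T> client;
  std::call_once(flag, [&] { client.reset(new T(trainer_id)); });
  return client.get();
}

int main() {
  FakeClient* a = GetClient<FakeClient>(0);
  FakeClient* b = GetClient<FakeClient>(7);  // already built; the first id wins
  std::cout << (a == b) << " " << a->trainer_id() << "\n";
}
```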
- -#pragma once - -#include // NOLINT -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -DECLARE_int32(rpc_deadline); -DECLARE_int32(rpc_retry_times); - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient { - public: - RPCClient() {} - virtual ~RPCClient() {} - virtual VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncPrefetchVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& in_var_name, - const std::string& out_var_name, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendFetchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendAndRecv( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& send_var_name, - const std::string& recv_var_name, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendComplete( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - // Complete tells all the pserver instances that finishe the training, - // the pserver can reduce it's barrier count, and continue to train - // with other trainers. - virtual void SendComplete() = 0; - - virtual bool Wait() = 0; - - template - static RPCClient* GetInstance(int trainer_id) { - std::call_once(init_flag_, &RPCClient::Init, trainer_id); - return rpc_client_.get(); - } - - // Init is called by GetInstance. 
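For context, roughly how trainer code drove this interface (a sketch only; RPCCLIENT_T is the macro from distributed.h that selects the gRPC or BRPC implementation):

#include <string>

#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/platform/device_context.h"

// Sketch: send one variable to a pserver endpoint and block until done.
void SendOneVar(const std::string& ep,
                const paddle::platform::DeviceContext& ctx,
                const paddle::framework::Scope& scope) {
  namespace distributed = paddle::operators::distributed;

  // One process-wide client per trainer, created lazily via std::call_once
  // in GetInstance/Init.
  distributed::RPCClient* client =
      distributed::RPCClient::GetInstance<RPCCLIENT_T>(/*trainer_id=*/0);

  // Async* calls return immediately with a VarHandlePtr; Wait() blocks until
  // every outstanding request finishes or FLAGS_rpc_deadline expires.
  client->AsyncSendVar(ep, ctx, scope, "x");
  client->Wait();
}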
- template - static void Init(int trainer_id) { - VLOG(1) << "init rpc client with trainer_id " << trainer_id; - trainer_id_ = trainer_id; - if (rpc_client_.get() == nullptr) { - rpc_client_.reset(new T()); - rpc_client_->InitImpl(); - } - } - - virtual void InitImpl() {} - - protected: - // each trainer have exact one trainer id, it should be static - static int trainer_id_; - - private: - static std::once_flag init_flag_; - static std::unique_ptr rpc_client_; -}; -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc deleted file mode 100644 index 37cf0460fb1fa..0000000000000 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/rpc_server.h" - -#include -#include - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestHandler; - -void RPCServer::ShutDown() { - VLOG(3) << "RPCServer ShutDown "; - ShutDownImpl(); - - exit_flag_ = true; - barrier_cond_.notify_all(); - rpc_cond_.notify_all(); -} - -void RPCServer::SavePort() const { - auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid()); - std::ofstream port_file; - port_file.open(file_path); - port_file << selected_port_; - port_file.close(); - VLOG(3) << "selected port written to " << file_path; -} - -void RPCServer::WaitBarrier(const std::string& rpc_name) { - VLOG(3) << "WaitBarrier in: " << rpc_name; - std::unique_lock lock(this->mutex_); - barrier_cond_.wait(lock, [this, &rpc_name] { - return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) || - exit_flag_.load()); - }); - - VLOG(3) << "WaitBarrier out: " << rpc_name - << " counter: " << barrier_counter_[rpc_name]; -} - -void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { - VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; - // barrier msg should make sure that it's in the right cond(send|recv) - WaitCond(rpc_name); - int b = 0; - std::unique_lock lock(mutex_); - b = ++barrier_counter_[rpc_name]; - VLOG(3) << rpc_name << " barrier_counter: " << b; - if (b >= client_num_) { - lock.unlock(); - VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for " - << rpc_name; - barrier_cond_.notify_all(); - lock.lock(); - } -} - -void RPCServer::Complete() { - { - std::unique_lock lock(mutex_); - client_num_--; - need_reset_all_vars_ = true; - - VLOG(3) << "decrease client_num to: " << client_num_; - if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { - barrier_counter_[kRequestGet]--; - } - } - barrier_cond_.notify_all(); -} - -bool RPCServer::NeedResetAllVars() { - std::unique_lock 
lock(mutex_); - return need_reset_all_vars_; -} - -int RPCServer::GetClientNum() { - std::unique_lock lock(mutex_); - return client_num_; -} - -void RPCServer::ResetBarrierCounter() { - VLOG(3) << "RPCServer ResetBarrierCounter "; - std::unique_lock lock(mutex_); - for (auto& t : barrier_counter_) { - t.second = 0; - } - need_reset_all_vars_ = false; -} - -void RPCServer::RegisterRPC(const std::string& rpc_name, - RequestHandler* handler, int thread_num) { - rpc_call_map_[rpc_name] = handler; - rpc_thread_num_[rpc_name] = thread_num; - - static int cond = -1; - rpc_cond_map_[rpc_name] = ++cond; - VLOG(3) << "RegisterRPC rpc_name: " << rpc_name << ", handler: " << handler - << ", cond: " << rpc_cond_map_[rpc_name]; -} - -void RPCServer::SetCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer SetCond " << rpc_name; - { - std::unique_lock lock(mutex_); - cur_cond_ = rpc_cond_map_[rpc_name]; - } - - rpc_cond_.notify_all(); -} - -void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer WaitCond in " << rpc_name; - int cond = 0; - { - std::unique_lock lock(mutex_); - cond = rpc_cond_map_[rpc_name]; - } - - std::unique_lock lock(mutex_); - rpc_cond_.wait( - lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); - VLOG(3) << "RPCServer WaitCond out " << rpc_name; -} - -void RPCServer::RegisterVar(const std::string& var_name, - const std::string& rpc_name, - framework::Scope* scope, - platform::DeviceContext* dev_ctx) { - MonomerHandle h; - h.var_name_ = var_name; - h.rpc_name_ = rpc_name; - h.scope_ = scope; - h.dev_ctx_ = dev_ctx; - - { - std::unique_lock lock(mutex_); - PADDLE_ENFORCE_EQ( - var_map_.find(var_name), var_map_.end(), - platform::errors::AlreadyExists("%s already in var_map.", var_name)); - var_map_[var_name] = h; - } - - rpc_cond_.notify_all(); - VLOG(3) << "RegisterVar context:" << h.String(); -} - -void RPCServer::IncreaseVarBarrier(const std::string& var_name) { - int b = 0; - MonomerHandle h; - { - std::unique_lock lock(mutex_); - b = ++var_map_[var_name].barrier_; - h = var_map_[var_name]; - } - - if (b >= client_num_) { - barrier_cond_.notify_all(); - } - - VLOG(3) << "IncreaseVarBarrier context:" << h.String(); -} - -void RPCServer::WaitVarBarrier(const std::string& var_name) { - VLOG(3) << "WaitVarBarrier var_name:" << var_name; - - std::unique_lock lock(mutex_); - barrier_cond_.wait(lock, [&]() { - return ((var_map_[var_name].barrier_ >= client_num_ && client_num_ != 0) || - exit_flag_.load()); - }); - - VLOG(3) << "WaitVarBarrier context: " << var_map_[var_name].String(); -} - -void RPCServer::SetVarCond(const std::string& var_name) { - VLOG(3) << "SetVarCond var_name:" << var_name; - { - std::unique_lock lock(mutex_); - if (var_map_.find(var_name) != var_map_.end()) { - rpc_cond_.notify_all(); - } - } -} - -void RPCServer::WaitVarCond(const std::string& var_name) { - VLOG(3) << "WaitVarCond var_name:" << var_name; - - std::unique_lock lock(mutex_); - rpc_cond_.wait(lock, [=] { - return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load()); - }); - - VLOG(3) << "WaitVarCond var_name:" << var_name << " end"; -} - -MonomerHandle RPCServer::GetMonomer(const std::string& var_name) { - MonomerHandle h; - { - std::unique_lock lock(mutex_); - h = var_map_[var_name]; - } - - return h; -} - -void RPCServer::ClearRegisteredVars() { - std::unique_lock lock(mutex_); - var_map_.clear(); -} - -void RPCServer::ClearVar(const std::string& var_name) { - std::unique_lock lock(mutex_); - var_map_.erase(var_name); -} -} // namespace 
distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h deleted file mode 100644 index 2120260515e25..0000000000000 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include // NOLINT -#include -#include -#include - -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestHandler; - -struct MonomerHandle { - std::string var_name_; - std::string rpc_name_; - framework::Scope* scope_{nullptr}; - platform::DeviceContext* dev_ctx_{nullptr}; - int64_t barrier_{0}; - - std::string String() { - std::stringstream ss; - ss << "var_name:" << var_name_ << ", rpc_name:" << rpc_name_ - << ", scope:" << scope_ << ", dev_ctx:" << dev_ctx_ - << ", barrier_:" << barrier_; - return ss.str(); - } -}; - -class RPCServer { - public: - explicit RPCServer(const std::string& address, int client_num) - : cur_cond_(0), - bind_address_(address), - exit_flag_(false), - selected_port_(0), - client_num_(client_num), - need_reset_all_vars_(false) {} - - virtual ~RPCServer() {} - virtual void StartServer() = 0; - virtual void WaitServerReady() = 0; - - void ShutDown(); - - bool IsExit() { return exit_flag_.load(); } - - int GetSelectedPort() const { return selected_port_; } - - int GetClientNum(); - - void SavePort() const; - - // RegisterRPC, register the rpc method name to a handler - // class, and auto generate a condition id for this call - // to be used for the barrier. - void RegisterRPC(const std::string& rpc_name, RequestHandler* handler, - int thread_num = 1); - - int GetThreadNum(const std::string& rpc_name) { - return rpc_thread_num_[rpc_name]; - } - - // Wait util all the clients have reached the barrier for one - // rpc method. This function should be called in the - // RequestHandler if you want to run the server/client in a - // synchronous mode. 
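Taken together with the .cc file above, the server side was typically wired up as below (a condensed sketch based on the StartServer helpers in rpc_server_test.cc later in this patch; RPCSERVER_T is the macro from distributed.h selecting the gRPC or BRPC server):

#include <thread>

#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/rpc_server.h"

namespace distributed = paddle::operators::distributed;

// Sketch: bind an ephemeral port, serve one RPC kind, then shut down.
void RunServerOnce(distributed::RequestHandler* handler) {
  RPCSERVER_T server("127.0.0.1:0", /*client_num=*/1);

  // Each registered rpc name gets a handler plus an auto-generated
  // condition id used by SetCond/WaitCond and the batch barrier.
  server.RegisterRPC(distributed::kRequestSend, handler);
  handler->SetRPCServer(&server);

  std::thread serving([&server] { server.StartServer(); });
  server.WaitServerReady();             // returns once the port is bound
  int port = server.GetSelectedPort();  // real port picked for ":0"
  (void)port;

  server.ShutDown();                    // sets exit_flag_ and wakes waiters
  serving.join();
}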
- void WaitBarrier(const std::string& rpc_name); - - void SetCond(const std::string& rpc_name); - void WaitCond(const std::string& rpc_name); - void IncreaseBatchBarrier(const std::string rpc_name); - - void RegisterVar(const std::string& var_name, const std::string& rpc_name, - framework::Scope* scope, platform::DeviceContext* dev_ctx); - void IncreaseVarBarrier(const std::string& var_name); - void WaitVarBarrier(const std::string& var_name); - void SetVarCond(const std::string& var_name); - void WaitVarCond(const std::string& var_name); - void ClearRegisteredVars(); - void ClearVar(const std::string& var_name); - MonomerHandle GetMonomer(const std::string& var_name); - - void Complete(); - - void ResetBarrierCounter(); - - bool NeedResetAllVars(); - - protected: - virtual void ShutDownImpl() = 0; - - private: - std::mutex mutex_; - std::unordered_map barrier_counter_; - std::condition_variable barrier_cond_; - - std::unordered_map rpc_cond_map_; - std::atomic cur_cond_; - std::condition_variable rpc_cond_; - - protected: - std::string bind_address_; - std::atomic exit_flag_; - int selected_port_; - int client_num_; - bool need_reset_all_vars_; - - std::unordered_map rpc_call_map_; - std::unordered_map rpc_thread_num_; - friend class RequestHandler; - - // TODO(gongwb): use more cond to notify or wait; - std::unordered_map var_map_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc deleted file mode 100644 index f59285400033d..0000000000000 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ /dev/null @@ -1,344 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace distributed = paddle::operators::distributed; - -USE_NO_KERNEL_OP(lookup_sparse_table_read); -USE_NO_KERNEL_OP(checkpoint_notify); -USE_OP(scale); - -std::unique_ptr g_rpc_service; -std::unique_ptr g_req_handler; - -framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { - auto root_block = program->MutableBlock(0); - auto* block = program->AppendBlock(*root_block); - - framework::OpDesc* op = block->AppendOp(); - op->SetType("scale"); - op->SetInput("X", {"x"}); - op->SetOutput("Out", {"res"}); - op->SetAttr("scale", 0.5f); - - auto& out = *root_block->Var("res"); - out.SetType(framework::proto::VarType::LOD_TENSOR); - out.SetShape({1, 10}); - - return block; -} - -void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { - auto w_var = scope->Var("w"); - w_var->GetMutable(); - - auto out_var = scope->Var("out"); - out_var->GetMutable(); - - auto ids_var = scope->Var("ids"); - ids_var->GetMutable(); - - auto x_var = scope->Var("x"); - x_var->GetMutable(); - - auto res_var = scope->Var("res"); - res_var->GetMutable(); -} - -void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, - int64_t rows_numel) { - CreateVarsOnScope(scope, place); - auto ids_var = scope->Var("ids")->GetMutable(); - int64_t* ids_ptr = - ids_var->mutable_data(framework::DDim({rows_numel, 1}), *place); - for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; - - auto x_var = scope->Var("x")->GetMutable(); - float* x_ptr = - x_var->mutable_data(framework::DDim({1, rows_numel}), *place); - for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0; -} - -void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, - int64_t rows_numel) { - CreateVarsOnScope(scope, place); - auto w = scope->Var("w")->GetMutable(); - auto w_value = w->mutable_value(); - w_value->Resize({rows_numel, 10}); - for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); - - auto ptr = w_value->mutable_data(*place); - - for (int64_t i = 0; i < w_value->numel(); ++i) { - ptr[i] = static_cast(i / 10); - } -} - -void StartServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - - std::unordered_map> - prefetch_var_name_to_prepared; - - g_req_handler->SetProgram(&program); - g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - - // distributed::HeartBeatMonitor::Init(1, true, "w@grad"); - - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - 
std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -void StartSendAndRecvServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - auto block = AppendSendAndRecvBlock(&program); - std::string in_var_name("x"); - std::vector prefetch_block_ids{block->ID()}; - auto prepared = exe.Prepare(program, prefetch_block_ids); - InitTensorsOnServer(&scope, &place, 10); - - std::unordered_map> - grad_to_prepared_ctx; - grad_to_prepared_ctx[in_var_name] = prepared[0]; - - g_req_handler->SetProgram(&program); - g_req_handler->SetGradToPreparedCtx(&grad_to_prepared_ctx); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -TEST(COMPLETE, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - g_req_handler.reset( - new distributed::RequestSendHandler(distributed::DistributedMode::kSync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2)); - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - std::thread server_thread(StartServer, distributed::kRequestSend); - g_rpc_service->WaitServerReady(); - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - client->AsyncSendComplete(ep); - client->Wait(); - - EXPECT_EQ(g_rpc_service->GetClientNum(), 1); - - g_rpc_service->ShutDown(); - server_thread.join(); - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} - -TEST(SENDANDRECV, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - g_req_handler.reset(new distributed::RequestSendAndRecvHandler( - distributed::DistributedMode::kAsync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - std::thread server_thread(StartSendAndRecvServer, - distributed::kRequestSendAndRecv); - g_rpc_service->WaitServerReady(); - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - - framework::Scope scope; - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); - - // create var on local scope - int64_t rows_numel = 10; - InitTensorsOnClient(&scope, &place, rows_numel); - std::string in_var_name("x"); - std::string out_var_name("res"); - - client->AsyncSendAndRecv(ep, ctx, scope, in_var_name, out_var_name); - client->Wait(); - auto var = scope.Var(out_var_name); - auto value = var->GetMutable(); - auto ptr = value->mutable_data(place); - - for (int64_t i = 0; i < rows_numel; ++i) { - EXPECT_EQ(ptr[i], 0.5); - } - g_rpc_service->ShutDown(); - server_thread.join(); - LOG(INFO) << "begin reset"; - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} - -void StartCheckpointServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - 
framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - - std::vector metas; - - auto meta = distributed::SparseMeta(); - meta.name = "embedding.block0"; - meta.value_names = {"Param"}; - meta.value_dims = {64}; - meta.mode = distributed::Mode::training; - meta.grad_name = "embedding@Grad"; - meta.cached_varnames = {"kSparseIds"}; - meta.initializer_attrs = {"fill_constant&1.0"}; - meta.entry = "none"; - - metas.push_back(meta); - distributed::LargeScaleKV::Init(metas); - - auto* ins = distributed::LargeScaleKV::GetInstance(); - ins->Get("embedding.block0")->Init({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); - - std::unordered_map> - prefetch_var_name_to_prepared; - - g_req_handler->SetProgram(&program); - g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -TEST(LARGE_SCALE_CHECKPOINT, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - - paddle::framework::Scope scope; - paddle::platform::CPUPlace place; - - g_req_handler.reset(new distributed::RequestCheckpointHandler( - distributed::DistributedMode::kAsync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); - - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - - std::thread server_thread(StartCheckpointServer, - distributed::kRequestCheckpoint); - g_rpc_service->WaitServerReady(); - - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - - auto save_path = - paddle::string::Sprintf("%s/%s/%s", "/tmp/large_scale_table/base", - "embedding", "embedding.block0"); - int mode = 0; - client->AsyncCheckpointNotify(ep, save_path, "embedding.block0", mode); - client->Wait(); - - save_path = - paddle::string::Sprintf("%s/%s/%s", "/tmp/large_scale_table/delta", - "embedding", "embedding.block0"); - mode = 1; - client->AsyncCheckpointNotify(ep, save_path, "embedding.block0", mode); - client->Wait(); - - paddle::framework::AttributeMap attrs; - - std::vector eps = {ep}; - attrs["endpoints"] = eps; - attrs["dirname"] = std::string("/tmp/large_scale_table/delta1"); - attrs["varname"] = std::string("embedding"); - attrs["mode"] = 2; - std::vector slices = {"embedding.block0"}; - attrs["slice_varnames"] = slices; - std::vector remotes = {"embedding.block0"}; - attrs["remote_varnames"] = remotes; - - auto ops = - framework::OpRegistry::CreateOp("checkpoint_notify", {}, {}, attrs, true); - ops->Run(scope, place); - - g_rpc_service->ShutDown(); - server_thread.join(); - LOG(INFO) << "begin reset"; - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in deleted file mode 100644 index a333642bd16fb..0000000000000 --- a/paddle/fluid/operators/distributed/send_recv.proto.in +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under -the Apache License, Version 2.0 (the "License"); you may not use this file -except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -syntax = "proto3"; -package sendrecv; - -option cc_generic_services = @cc_generic_services@; - -service SendRecvService { - // For parameter server round-robin like hashing, do not split tensors. - // Send and recv only one tensor - // TODO(typhoonzero): add streaming API - rpc SendVariable(VariableMessage) returns (VoidMessage) {} - // Argument VariableMessage for GetVariable should only contain varname. - rpc GetVariable(VariableMessage) returns (VariableMessage) {} - rpc GetVariableNoBarrier(VariableMessage) returns (VariableMessage) {} - // pre-fetch variable by given variable name and Ids - rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} - - rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} - rpc DistributeNotify(VariableMessage) returns (VoidMessage) {} - rpc SendAndRecvVariable(VariableMessage) returns (VariableMessage) {} - rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {} - rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} -} - -// It can be: LoDTensor、SelectedRows or NCCL_ID -enum VarType { - LOD_TENSOR = 0; - SELECTED_ROWS = 1; - NCCL_ID = 2; -} - -// VariableMessage is serialized paddle variable message. -// NOTICE(gongwb):don't modify this proto if you are not -// not familar with how we serialize in sendrecvop_utils.h -// and deserilize it in variable_response.h. -message VariableMessage { - enum Type { - // Pod Types - BOOL = 0; - INT16 = 1; - INT32 = 2; - INT64 = 3; - FP16 = 4; - FP32 = 5; - FP64 = 6; - } - - message LodData { repeated int64 lod_data = 1; } - string varname = 1; - // TODO(Yancey1989): reference framework::proto::VarDesc::VarType - VarType type = 2; - // bool persistable is not needed for sending. - // tensor info: - Type data_type = 3; - repeated int64 dims = 4; - - // lod details: - int64 lod_level = 5; - repeated LodData lod = 6; - // selected_rows height, aka. original dim0 - int64 slr_height = 7; - // tensor data - bytes serialized = 8; - // selected_rows data - bytes rows = 9; - // Look up table block execution output variable name. - string out_varname = 10; - // If 1, the ps server will start profiling, the ps - // server stops profiling and generates a profile to /tmp/profile_ps_* - // when profile switches from 1 to 2. - int64 profile = 11; - int64 trainer_id = 12; - string table_name = 13; -} - -message VoidMessage {} diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc deleted file mode 100644 index 107c74eb2670e..0000000000000 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
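As a rough illustration, a LoDTensor was described on the wire by filling the message above along these lines (field names come from the proto; the helper itself is hypothetical, the real serialization lived in sendrecvop_utils.cc and variable_response.h):

#include <cstdint>
#include <string>
#include <vector>

#include "paddle/fluid/operators/distributed/distributed_pb.h"

// Hypothetical helper: describe an FP32 LoDTensor without LoD information.
sendrecv::VariableMessage DescribeTensor(const std::string& varname,
                                         const std::vector<int64_t>& dims) {
  sendrecv::VariableMessage msg;
  msg.set_varname(varname);
  msg.set_type(sendrecv::LOD_TENSOR);  // LOD_TENSOR / SELECTED_ROWS / NCCL_ID
  msg.set_data_type(sendrecv::VariableMessage::FP32);
  for (int64_t d : dims) {
    msg.add_dims(d);
  }
  msg.set_lod_level(0);  // no LoD attached
  // The raw tensor bytes travel in the `serialized` field; GetTensorPayload
  // below prepares that buffer (pinning GPU memory first when needed).
  return msg;
}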
-See the License for the specific language governing permissions and -limitations under the License. */ -#include - -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -} // namespace paddle - -DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not."); -DEFINE_int32(rpc_retry_bind_port, 3, - "Retry to bind the address if address is already used."); - -namespace paddle { -namespace operators { -namespace distributed { - -using VarMsg = sendrecv::VariableMessage; - -static TensorPayload GetCommunicationAllocationFromTensor( - const platform::DeviceContext& ctx, const framework::Tensor& tensor) { - if (is_gpu_place(ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_EQ( - is_gpu_place(tensor.place()), true, - platform::errors::PreconditionNotMet("Please run in gpu place.")); - auto& gpu_dev_ctx = - reinterpret_cast(ctx); - auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type()); - platform::CUDAPinnedPlace cuda_pinned; - auto result = memory::AllocShared(cuda_pinned, copy_size); - - memory::Copy(cuda_pinned, result->ptr(), - BOOST_GET_CONST(platform::CUDAPlace, tensor.place()), - tensor.data(), copy_size, gpu_dev_ctx.stream()); - ctx.Wait(); - return TensorPayload(result); -#else - PADDLE_THROW( - platform::errors::Unavailable("This situation should not be happened")); -#endif - } else { - return TensorPayload(tensor); - } -} -TensorPayload GetTensorPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request) { - auto tensor = var->Get(); - // FIXME(wuyi): data types in send_recv.proto is copied from - // framework.proto - request->set_data_type(static_cast(tensor.type())); - for (auto& dim : framework::vectorize(tensor.dims())) { - request->add_dims(dim); - } - const framework::LoD lod = tensor.lod(); - if (lod.size() > 0) { - request->set_lod_level(lod.size()); - for (auto& each : lod) { - VarMsg::LodData* lod_inner = request->add_lod(); - for (auto& d : each) { - lod_inner->add_lod_data(d); - } - } - } - return GetCommunicationAllocationFromTensor(ctx, tensor); -} - -TensorPayload GetSelectedRowsPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request) { - auto* slr = var->GetMutable(); - request->set_data_type(static_cast(slr->value().type())); - request->set_lod_level(0); - request->set_slr_height(slr->height()); - - for (auto& dim : framework::vectorize(slr->value().dims())) { - request->add_dims(dim); - } - - auto* tensor = slr->mutable_value(); - return GetCommunicationAllocationFromTensor(ctx, *tensor); -} - -TensorPayload::TensorPayload(std::shared_ptr allocation) - : allocation_(allocation), offset_(0), memory_size_(allocation->size()) {} -TensorPayload::TensorPayload(const framework::Tensor& tensor) - : allocation_(tensor.Holder()), - offset_(tensor.offset()), - memory_size_(tensor.numel() * framework::SizeOfType(tensor.type())) {} -void* TensorPayload::ptr() const { - return reinterpret_cast( - reinterpret_cast(allocation_->ptr()) + offset_); -} -size_t TensorPayload::memory_size() const { return memory_size_; } -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h deleted file mode 100644 
index 84ed1ab024712..0000000000000 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/platform/port.h" - -namespace paddle { -namespace framework { -class Tensor; -class Variable; -} // namespace framework -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -using VarMsg = sendrecv::VariableMessage; - -class TensorPayload final { - public: - explicit TensorPayload(const framework::Tensor& tensor); - explicit TensorPayload(std::shared_ptr allocation); - - TensorPayload(const TensorPayload& o) = default; - TensorPayload& operator=(const TensorPayload& o) = default; - - void* ptr() const; - size_t memory_size() const; - - private: - std::shared_ptr allocation_; - size_t offset_; - size_t memory_size_; -}; - -inline void SerializeDestroyCallback(void* payload) { - if (payload != nullptr) { - auto* shared_payload = reinterpret_cast(payload); - delete shared_payload; - } -} - -TensorPayload GetTensorPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request); - -TensorPayload GetSelectedRowsPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request); - -inline framework::proto::VarType::Type ToVarType( - sendrecv::VariableMessage::Type type) { - switch (type) { - case sendrecv::VariableMessage::FP32: - return framework::proto::VarType::FP32; // NOLINT - case sendrecv::VariableMessage::FP64: - return framework::proto::VarType::FP64; // NOLINT - case sendrecv::VariableMessage::INT32: - return framework::proto::VarType::INT32; // NOLINT - case sendrecv::VariableMessage::INT64: - return framework::proto::VarType::INT64; // NOLINT - case sendrecv::VariableMessage::BOOL: - return framework::proto::VarType::BOOL; // NOLINT - default: - PADDLE_THROW( - platform::errors::InvalidArgument("Not support type id: %d.", type)); - } -} - -template