From f503f129250cae7e2d0531e93c3d948bc1f906ef Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 20 Jun 2018 11:29:22 +0800 Subject: [PATCH 1/5] enable dynamic load mklml lib on fluid --- cmake/external/openblas.cmake | 7 +- .../tests/book/test_inference_nlp.cc | 4 +- paddle/fluid/operators/math/blas.h | 7 +- paddle/fluid/operators/math/blas_impl.h | 91 ++++++++++++++----- paddle/fluid/operators/math/math_function.h | 4 +- paddle/fluid/platform/dynload/CMakeLists.txt | 4 + .../fluid/platform/dynload/dynamic_loader.cc | 10 ++ .../fluid/platform/dynload/dynamic_loader.h | 1 + paddle/fluid/platform/dynload/mklml.cc | 30 ++++++ paddle/fluid/platform/dynload/mklml.h | 71 +++++++++++++++ 10 files changed, 193 insertions(+), 36 deletions(-) create mode 100644 paddle/fluid/platform/dynload/mklml.cc create mode 100644 paddle/fluid/platform/dynload/mklml.h diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 4a49a92f2b131..ce6a88b51dc98 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -114,7 +114,12 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c) FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";") ADD_LIBRARY(cblas STATIC ${dummyfile}) -TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) + +IF("${CBLAS_PROVIDER}" STREQUAL "MKLML") + TARGET_LINK_LIBRARIES(cblas dynload_mklml) +ELSE() + TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) +ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML") IF(NOT ${CBLAS_FOUND}) ADD_DEPENDENCIES(cblas extern_openblas) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index cbba8b9d559e0..03b0b69463397 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/inference/tests/test_helper.h" +#include "paddle/fluid/operators/math/blas.h" #ifdef PADDLE_WITH_MKLML -#include #include #endif @@ -164,7 +164,7 @@ TEST(inference, nlp) { // only use 1 thread number per std::thread omp_set_dynamic(0); omp_set_num_threads(1); - mkl_set_num_threads(1); + paddle::operators::math::SetNumThreads(1); #endif double start_ms = 0, stop_ms = 0; diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 6207d14ecdc92..a907d6a71b7a1 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -18,10 +18,7 @@ #include "paddle/fluid/framework/tensor.h" #ifdef PADDLE_WITH_MKLML -#include -#include -#include -#include +#include "paddle/fluid/platform/dynload/mklml.h" #endif #ifdef PADDLE_USE_OPENBLAS @@ -55,7 +52,7 @@ static void SetNumThreads(int num_threads) { openblas_set_num_threads(real_num_threads); #elif defined(PADDLE_WITH_MKLML) int real_num_threads = num_threads > 1 ? num_threads : 1; - mkl_set_num_threads(real_num_threads); + platform::dynload::MKL_Set_Num_Threads(real_num_threads); #else PADDLE_ENFORCE(false, "To be implemented."); #endif diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index ae20406bc21d5..2ce94cfc93823 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -22,61 +22,109 @@ namespace math { template struct CBlas; +#ifdef PADDLE_WITH_MKLML template <> struct CBlas { template static void GEMM(ARGS... args) { - cblas_sgemm(args...); + platform::dynload::cblas_sgemm(args...); } template static void AXPY(ARGS... args) { - cblas_saxpy(args...); + platform::dynload::cblas_saxpy(args...); + } + + template + static void VCOPY(ARGS... args) { + platform::dynload::cblas_scopy(args...); + } + + template + static void GEMV(ARGS... args) { + platform::dynload::cblas_sgemv(args...); + } + + template + static void GEMM_BATCH(ARGS... args) { + platform::dynload::cblas_sgemm_batch(args...); } -#ifdef PADDLE_WITH_MKLML template static void VADD(ARGS... args) { - vsAdd(args...); + platform::dynload::vsAdd(args...); + } +}; + +template <> +struct CBlas { + template + static void GEMM(ARGS... args) { + platform::dynload::cblas_dgemm(args...); + } + + template + static void AXPY(ARGS... args) { + platform::dynload::cblas_daxpy(args...); } -#endif template static void VCOPY(ARGS... args) { - cblas_scopy(args...); + platform::dynload::cblas_dcopy(args...); } template static void GEMV(ARGS... args) { - cblas_sgemv(args...); + platform::dynload::cblas_dgemv(args...); } -#ifdef PADDLE_WITH_MKLML template static void GEMM_BATCH(ARGS... args) { - cblas_sgemm_batch(args...); + platform::dynload::cblas_dgemm_batch(args...); + } + + template + static void VADD(ARGS... args) { + platform::dynload::vdAdd(args...); } -#endif }; +#else + template <> -struct CBlas { +struct CBlas { template static void GEMM(ARGS... args) { - cblas_dgemm(args...); + cblas_sgemm(args...); } template static void AXPY(ARGS... args) { - cblas_daxpy(args...); + cblas_saxpy(args...); } -#ifdef PADDLE_WITH_MKLML template - static void VADD(ARGS... args) { - vdAdd(args...); + static void VCOPY(ARGS... args) { + cblas_scopy(args...); + } + + template + static void GEMV(ARGS... args) { + cblas_sgemv(args...); + } +}; + +template <> +struct CBlas { + template + static void GEMM(ARGS... args) { + cblas_dgemm(args...); + } + + template + static void AXPY(ARGS... args) { + cblas_daxpy(args...); } -#endif template static void VCOPY(ARGS... args) { @@ -87,15 +135,8 @@ struct CBlas { static void GEMV(ARGS... args) { cblas_dgemv(args...); } - -#ifdef PADDLE_WITH_MKLML - template - static void GEMM_BATCH(ARGS... args) { - cblas_dgemm_batch(args...); - } -#endif }; - +#endif template <> struct CBlas { static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h index 8b296b6a07ca2..56a039d3cec73 100644 --- a/paddle/fluid/operators/math/math_function.h +++ b/paddle/fluid/operators/math/math_function.h @@ -14,9 +14,7 @@ limitations under the License. */ #pragma once #ifdef PADDLE_WITH_MKLML -#include -#include -#include +#include "paddle/fluid/platform/dynload/mklml.h" #endif #ifdef PADDLE_USE_OPENBLAS diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 364c4901b297d..68fa576543410 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -12,3 +12,7 @@ if (CUPTI_FOUND) endif(CUPTI_FOUND) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +if (WITH_MKLML) + cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) +endif() +# TODO(TJ): add iomp, mkldnn? diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 19c01dc5a968c..34fbccddc2bc2 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -49,6 +49,8 @@ DEFINE_string( tensorrt_dir, "", "Specify path for loading tensorrt library, such as libnvinfer.so."); +DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); + namespace paddle { namespace platform { namespace dynload { @@ -206,6 +208,14 @@ void* GetTensorRtDsoHandle() { #endif } +void* GetMKLMLDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); +#else + return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); +#endif +} + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 0de3559b60880..ca87dc47f355a 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -26,6 +26,7 @@ void* GetWarpCTCDsoHandle(); void* GetLapackDsoHandle(); void* GetNCCLDsoHandle(); void* GetTensorRtDsoHandle(); +void* GetMKLMLDsoHandle(); } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/mklml.cc b/paddle/fluid/platform/dynload/mklml.cc new file mode 100644 index 0000000000000..0f61a5e09b324 --- /dev/null +++ b/paddle/fluid/platform/dynload/mklml.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/mklml.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag mklml_dso_flag; +void* mklml_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MKLML_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h new file mode 100644 index 0000000000000..17acefe8cde01 --- /dev/null +++ b/paddle/fluid/platform/dynload/mklml.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag mklml_dso_flag; +extern void* mklml_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load mklml routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_MKLML_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using mklmlFunc = decltype(&::__name); \ + std::call_once(mklml_dso_flag, []() { \ + mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \ + }); \ + static void* p_##_name = dlsym(mklml_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name) + +#define MKLML_ROUTINE_EACH(__macro) \ + __macro(cblas_sgemm); \ + __macro(cblas_saxpy); \ + __macro(cblas_scopy); \ + __macro(cblas_sgemv); \ + __macro(cblas_sgemm_batch); \ + __macro(cblas_dgemm); \ + __macro(cblas_daxpy); \ + __macro(cblas_dcopy); \ + __macro(cblas_dgemv); \ + __macro(cblas_dgemm_batch); \ + __macro(vsAdd); \ + __macro(vdAdd); \ + __macro(MKL_Set_Num_Threads) + +MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); + +#undef DYNAMIC_LOAD_MKLML_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle From 3e73a7a924937fc6cae409b36da3a60a555cd1a1 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 20 Jun 2018 16:00:00 +0800 Subject: [PATCH 2/5] add usr local lib to dynamic search path --- paddle/fluid/platform/dynload/dynamic_loader.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 34fbccddc2bc2..7b0adf25ac06d 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -78,7 +78,12 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, VLOG(3) << "Try to find library: " << dso_path << " from default system path."; // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH + // and /usr/local/lib path void* dso_handle = dlopen(dso_path.c_str(), dynload_flags); + if (nullptr == dso_handle) { + dso_handle = + dlopen(join("/usr/local/lib/", dso_path).c_str(), dynload_flags); + } // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to // bring System Integrity Projection (SIP), if dso_handle @@ -99,6 +104,10 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, } #endif + if (nullptr == dso_handle) { + LOG(WARNING) << "Can not find library: " << dso_path + << ". Please try to set add the lib path to LD_LIBRARY_PATH."; + } return dso_handle; } From 17c9e3d223ed249c4b7bd6adbe2b8dd1b30773e7 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 20 Jun 2018 21:17:35 +0800 Subject: [PATCH 3/5] only deps mklml so, do not need link it --- cmake/generic.cmake | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 0e2df86c19086..fc2094b5077d6 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -195,6 +195,14 @@ function(cc_library TARGET_NAME) list(REMOVE_ITEM cc_library_DEPS warpctc) add_dependencies(${TARGET_NAME} warpctc) endif() + # Only deps libmklml.so, not link + if("${cc_library_DEPS};" MATCHES "mklml;") + list(REMOVE_ITEM cc_library_DEPS mklml) + if(NOT "${TARGET_NAME}" MATCHES "dynload_mklml") + list(APPEND cc_library_DEPS dynload_mklml) + endif() + add_dependencies(${TARGET_NAME} mklml) + endif() target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) endif() From c475041405e55d0a587358823387af4b70303ba5 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 21 Jun 2018 13:41:31 +0800 Subject: [PATCH 4/5] link iomp as needed --- cmake/generic.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index fc2094b5077d6..9c42044ec163e 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -202,6 +202,7 @@ function(cc_library TARGET_NAME) list(APPEND cc_library_DEPS dynload_mklml) endif() add_dependencies(${TARGET_NAME} mklml) + target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") endif() target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) From 28a0ef9522c65128b3afe5a1835bbc9a836c4b71 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 21 Jun 2018 16:34:16 +0800 Subject: [PATCH 5/5] remove usr local lib when dynamic load lib --- paddle/fluid/platform/dynload/dynamic_loader.cc | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 7b0adf25ac06d..198d8566b1bd7 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -80,10 +80,6 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH // and /usr/local/lib path void* dso_handle = dlopen(dso_path.c_str(), dynload_flags); - if (nullptr == dso_handle) { - dso_handle = - dlopen(join("/usr/local/lib/", dso_path).c_str(), dynload_flags); - } // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to // bring System Integrity Projection (SIP), if dso_handle @@ -106,7 +102,7 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, if (nullptr == dso_handle) { LOG(WARNING) << "Can not find library: " << dso_path - << ". Please try to set add the lib path to LD_LIBRARY_PATH."; + << ". Please try to add the lib path to LD_LIBRARY_PATH."; } return dso_handle; }