diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4e808094dd96..1d3ce94e7ebf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -633,6 +633,10 @@ if(USE_CUDA)
   else()
     list(APPEND CUDA_INCLUDE_DIRS ${INCLUDE_DIRECTORIES})
     # define preprocessor macro so that we will not include the generated forcelink header
+    if(ENABLE_CUDA_RTC)
+      add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
+    endif()
+    # Create '.cmake' files for cuda compiles given definitions added thus far
     mshadow_cuda_compile(cuda_objs ${CUDA})
     if(MSVC)
       if(ENABLE_CUDA_RTC)
@@ -640,7 +644,6 @@ if(USE_CUDA)
         list(APPEND mxnet_LINKER_LIBS ${CUDA_nvrtc_LIBRARY})
         set(CUDA_cuda_LIBRARY "${CUDA_nvrtc_LIBRARY}/../cuda.lib")
         list(APPEND mxnet_LINKER_LIBS ${CUDA_cuda_LIBRARY})
-        add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
       endif()
       FIND_LIBRARY(CUDA_cufft_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32")
       list(APPEND mxnet_LINKER_LIBS "${CUDA_cufft_LIBRARY}/../cufft.lib") # For fft operator
@@ -652,7 +655,6 @@ if(USE_CUDA)
       list(APPEND mxnet_LINKER_LIBS cufft cusolver)
       if(ENABLE_CUDA_RTC)
         list(APPEND mxnet_LINKER_LIBS nvrtc cuda)
-        add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
       endif()
       link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64")
     endif()
diff --git a/appveyor.yml b/appveyor.yml
index d44f52a0a9a9..9fa495002a1f 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -69,7 +69,7 @@
       set OpenCV_DIR=%APPVEYOR_BUILD_FOLDER%/%MXNET_OPENCV_DIR%/build
 
-      cmake .. -DOPENCV_DIR=%OpenCV_DIR% -DUSE_PROFILER=1 -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -G "Visual Studio 12 2013 Win64"
+      cmake .. -DOPENCV_DIR=%OpenCV_DIR% -DUSE_PROFILER=1 -DUSE_CUDA=0 -DUSE_CUDNN=0 -DENABLE_CUDA_RTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -G "Visual Studio 12 2013 Win64"
 
 build_script:
   - cmd: >-
diff --git a/ci/build_windows.py b/ci/build_windows.py
index 4673bd535e3e..ce77c316ab20 100755
--- a/ci/build_windows.py
+++ b/ci/build_windows.py
@@ -54,7 +54,7 @@ class BuildFlavour(Enum):
     'WIN_CPU': (
         '-DUSE_CUDA=OFF '
        '-DUSE_CUDNN=OFF '
-        '-DUSE_NVRTC=OFF '
+        '-DENABLE_CUDA_RTC=OFF '
         '-DUSE_OPENCV=ON '
         '-DUSE_OPENMP=ON '
         '-DUSE_BLAS=open '
@@ -67,7 +67,7 @@
     , 'WIN_CPU_MKLDNN': (
         '-DUSE_CUDA=OFF '
         '-DUSE_CUDNN=OFF '
-        '-DUSE_NVRTC=OFF '
+        '-DENABLE_CUDA_RTC=OFF '
         '-DUSE_OPENCV=ON '
         '-DUSE_OPENMP=ON '
         '-DUSE_BLAS=open '
@@ -80,7 +80,7 @@
     , 'WIN_CPU_MKLDNN_MKL': (
         '-DUSE_CUDA=OFF '
         '-DUSE_CUDNN=OFF '
-        '-DUSE_NVRTC=OFF '
+        '-DENABLE_CUDA_RTC=OFF '
         '-DUSE_OPENCV=ON '
         '-DUSE_OPENMP=ON '
         '-DUSE_BLAS=mkl '
@@ -93,7 +93,7 @@
     , 'WIN_CPU_MKL': (
         '-DUSE_CUDA=OFF '
         '-DUSE_CUDNN=OFF '
-        '-DUSE_NVRTC=OFF '
+        '-DENABLE_CUDA_RTC=OFF '
         '-DUSE_OPENCV=ON '
         '-DUSE_OPENMP=ON '
         '-DUSE_BLAS=mkl '
@@ -106,7 +106,7 @@
     , 'WIN_GPU': (
         '-DUSE_CUDA=ON '
         '-DUSE_CUDNN=ON '
-        '-DUSE_NVRTC=ON '
+        '-DENABLE_CUDA_RTC=ON '
         '-DUSE_OPENCV=ON '
         '-DUSE_OPENMP=ON '
         '-DUSE_BLAS=open '
@@ -122,7 +122,7 @@
     , 'WIN_GPU_MKLDNN': (
         '-DUSE_CUDA=ON '
         '-DUSE_CUDNN=ON '
-        '-DUSE_NVRTC=ON '
+        '-DENABLE_CUDA_RTC=ON '
         '-DUSE_OPENCV=ON '
         '-DUSE_OPENMP=ON '
         '-DUSE_BLAS=open '
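The build-system hunks above rename the user-facing option `USE_NVRTC` to `ENABLE_CUDA_RTC` and move the `add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)` call ahead of `mshadow_cuda_compile`, so both host and device objects are compiled with the macro visible. A minimal, self-contained sketch of how such a compile definition surfaces in C++ (the `main()` harness is illustrative only and not part of this PR; compile with e.g. `-DMXNET_USE_CUDA=1 -DMXNET_ENABLE_CUDA_RTC=1` to simulate the CMake-added definitions):

```cpp
// Hypothetical standalone probe for the build-time flags.
#include <iostream>

int main() {
#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC
  std::cout << "fusion path compiled in\n";
#else
  // An undefined identifier evaluates to 0 inside #if, so builds that never
  // define MXNET_ENABLE_CUDA_RTC fall through here without any error.
  std::cout << "fusion path compiled out\n";
#endif
  return 0;
}
```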
diff --git a/make/maven/maven_darwin_mkl.mk b/make/maven/maven_darwin_mkl.mk
index a7f2bdb027d4..9bf3fc46ce0b 100644
--- a/make/maven/maven_darwin_mkl.mk
+++ b/make/maven/maven_darwin_mkl.mk
@@ -77,7 +77,7 @@ USE_CUDNN = 0
 # CUDA_ARCH :=
 
 # whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
-USE_NVRTC = 0
+ENABLE_CUDA_RTC = 0
 
 # use openmp for parallelization
 USE_OPENMP = 0
diff --git a/make/maven/maven_linux_cu90mkl.mk b/make/maven/maven_linux_cu90mkl.mk
index e9ba46509973..e8caf73f186e 100644
--- a/make/maven/maven_linux_cu90mkl.mk
+++ b/make/maven/maven_linux_cu90mkl.mk
@@ -80,7 +80,7 @@ USE_NCCL = 1
 
 # whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
 USE_NVTX=1
-USE_NVRTC = 1
+ENABLE_CUDA_RTC = 1
 
 # use openmp for parallelization
 USE_OPENMP = 1
diff --git a/make/maven/maven_linux_cu92mkl.mk b/make/maven/maven_linux_cu92mkl.mk
index caa1c59c01d5..930341e71cb1 100644
--- a/make/maven/maven_linux_cu92mkl.mk
+++ b/make/maven/maven_linux_cu92mkl.mk
@@ -80,7 +80,7 @@ USE_NCCL = 1
 
 # whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
 USE_NVTX=1
-USE_NVRTC = 1
+ENABLE_CUDA_RTC = 1
 
 # use openmp for parallelization
 USE_OPENMP = 1
diff --git a/make/maven/maven_linux_mkl.mk b/make/maven/maven_linux_mkl.mk
index 3c8534a7e2aa..10aee5f35a46 100644
--- a/make/maven/maven_linux_mkl.mk
+++ b/make/maven/maven_linux_mkl.mk
@@ -76,7 +76,7 @@ USE_CUDNN = 0
 # CUDA_ARCH :=
 
 # whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
-USE_NVRTC = 0
+ENABLE_CUDA_RTC = 0
 
 # use openmp for parallelization
 USE_OPENMP = 1
diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h
index a5f125affcb0..55d431cf3298 100644
--- a/src/executor/exec_pass.h
+++ b/src/executor/exec_pass.h
@@ -221,6 +221,11 @@ Graph FusePointwiseForward(Graph&& g);
  */
 Graph FusePointwiseBackward(Graph&& g);
 
+/*!
+ * \brief Issue a one-time warning that fusion is not possible for this platform or build.
+ */
+void WarnFusionNotSupported();
+
 /*!
  * \brief Infer shapes in the graph given the information.
  * \param graph The input graph.
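The `WarnFusionNotSupported()` declaration added to exec_pass.h above is implemented in pointwise_fusion_pass.cc (next file). Its warn-once behavior rests on a function-local static flag; here is that idiom as a self-contained sketch, with `std::cerr` standing in for dmlc's `LOG(WARNING)`:

```cpp
// Sketch of the warn-once idiom behind WarnFusionNotSupported().
// The function-local static flag persists across calls, so the message
// is emitted at most once per process.
#include <iostream>

void WarnOnce(const char* msg) {
  static bool issued_warning = false;  // zero-initialized before first call
  if (!issued_warning) {
    issued_warning = true;
    std::cerr << "Warning: " << msg << '\n';
  }
}

int main() {
  WarnOnce("fusion not supported");  // prints
  WarnOnce("fusion not supported");  // suppressed
  return 0;
}
```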
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index b31ae7cf0cd7..be3df765e3d2 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -999,7 +999,7 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol,
   // setup gradient
   nnvm::Graph g = InitFullGraph(symbol, grad_req_types);
 
-#if MXNET_USE_CUDA && !defined(_WIN32)
+#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32)
   if (default_ctx.dev_mask() == Context::kGPU && dmlc::GetEnv("MXNET_USE_FUSION", true)) {
     nnvm::Graph unoptimized_graph;
     common::CopyGraph(&unoptimized_graph, g, false);
@@ -1032,7 +1032,12 @@
         << "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!";
     }
   }
-#endif  // MXNET_USE_CUDA
+#else
+  // Only warn user if MXNET_USE_FUSION env var is explicitly set
+  if (default_ctx.dev_mask() == Context::kGPU && dmlc::GetEnv("MXNET_USE_FUSION", false)) {
+    WarnFusionNotSupported();
+  }
+#endif  // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32)
 
   // create "device" and "context" attrs for the graph
   g = AssignContext(g, default_ctx, ctx_map,
diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc
index 6fe21402cb3a..6a0d5f4efe87 100644
--- a/src/executor/pointwise_fusion_pass.cc
+++ b/src/executor/pointwise_fusion_pass.cc
@@ -36,10 +36,26 @@
 #include "../operator/fusion/fused_op.h"
 #include "../operator/operator_common.h"
 
-#if MXNET_USE_CUDA
-
 namespace mxnet {
 namespace exec {
+
+void WarnFusionNotSupported() {
+  static bool issued_warning = false;
+  if (!issued_warning) {
+    issued_warning = true;
+#if defined(_WIN32)
+    LOG(WARNING) << "Omitting dynamic fused op creation - not enabled on Windows. "
+                 << "Unset env var MXNET_USE_FUSION=1 to quiet this message.";
+#else
+    LOG(WARNING) << "Omitting dynamic fused op creation - needs MXNet lib built with "
+                 << "USE_CUDA=1 and ENABLE_CUDA_RTC=1. Unset env var MXNET_USE_FUSION=1 "
+                 << "to quiet this message.";
+#endif  // defined(_WIN32)
+  }
+}
+
+#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC
+
 namespace {
   bool IsFusionCompatible(nnvm::Node* n) {
     using namespace mxnet::fusion;
@@ -304,8 +320,8 @@ Graph FusePointwiseBackward(Graph &&g) {
   ret.outputs = g.outputs;
   return ret;
 }
+#endif  // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC
 
 }  // namespace exec
 }  // namespace mxnet
-
-#endif  // MXNET_USE_CUDA
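Note the asymmetric defaults for `MXNET_USE_FUSION` in the hunks above: a fusion-capable build attempts fusion unless the user sets the variable to 0 (default `true`), while a fusion-incapable build stays silent unless the user explicitly opted in (default `false`). A sketch of that logic, where `GetEnvFlag` is a simplified stand-in for `dmlc::GetEnv<bool>` (whose exact parsing may differ):

```cpp
// Demonstrates the asymmetric env-var defaults used by the fusion passes.
#include <cstdlib>
#include <cstring>
#include <iostream>

bool GetEnvFlag(const char* name, bool default_value) {
  const char* value = std::getenv(name);
  if (value == nullptr) return default_value;  // unset -> caller's default
  return std::strcmp(value, "0") != 0;         // assumption: "0" means false
}

int main() {
  // Fusion-capable build: fuse unless the user opted out (default true).
  bool run_fusion = GetEnvFlag("MXNET_USE_FUSION", true);
  // Fusion-incapable build: warn only on explicit opt-in (default false).
  bool warn_user = GetEnvFlag("MXNET_USE_FUSION", false);
  std::cout << "run_fusion=" << run_fusion << " warn_user=" << warn_user << '\n';
  return 0;
}
```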
diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc
index 269729c18f58..24270f210888 100644
--- a/src/imperative/cached_op.cc
+++ b/src/imperative/cached_op.cc
@@ -167,10 +167,8 @@ void SetRefCounts(nnvm::Graph* fwd_graph, const nnvm::Graph& full_graph) {
 void OptimizeGraph(nnvm::Graph * full_graph, nnvm::Graph * fwd_graph, nnvm::Graph * grad_graph,
                    const Context& context, size_t num_forward_outputs, const bool inlining) {
-#if MXNET_USE_CUDA && !defined(_WIN32)
-  if (context.dev_mask() == kGPU &&
-      !inlining &&
-      dmlc::GetEnv("MXNET_USE_FUSION", true)) {
+#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32)
+  if (context.dev_mask() == kGPU && !inlining && dmlc::GetEnv("MXNET_USE_FUSION", true)) {
     nnvm::Graph unoptimized_graph;
     common::CopyGraph(&unoptimized_graph, *full_graph, false);
@@ -202,7 +200,12 @@ void OptimizeGraph(nnvm::Graph * full_graph, nnvm::Graph * fwd_graph, nnvm::Grap
         << "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!";
     }
   }
-#endif  // MXNET_USE_CUDA
+#else
+  // Only warn user if MXNET_USE_FUSION env var is explicitly set
+  if (context.dev_mask() == kGPU && !inlining && dmlc::GetEnv("MXNET_USE_FUSION", false)) {
+    exec::WarnFusionNotSupported();
+  }
+#endif  // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32)
 
   *fwd_graph = nnvm::Graph();
   fwd_graph->outputs = std::vector(full_graph->outputs.begin(),
diff --git a/src/operator/fusion/fused_op-inl.h b/src/operator/fusion/fused_op-inl.h
index 2966fe2ae910..e86ce7682ad8 100644
--- a/src/operator/fusion/fused_op-inl.h
+++ b/src/operator/fusion/fused_op-inl.h
@@ -24,7 +24,7 @@
 #include
 #include
 
-#if MXNET_USE_CUDA
+#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC
 
 namespace mxnet {
 
@@ -992,6 +992,6 @@ const char kernel_end[] = R"code(}
 
 }  // namespace mxnet
 
-#endif  // MXNET_USE_CUDA
+#endif  // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC
 
 #endif  // MXNET_OPERATOR_FUSION_FUSED_OP_INL_H_
diff --git a/src/operator/fusion/fused_op.cc b/src/operator/fusion/fused_op.cc
index 5c83c30308c7..5e2d782dd9e0 100644
--- a/src/operator/fusion/fused_op.cc
+++ b/src/operator/fusion/fused_op.cc
@@ -23,7 +23,7 @@
 #include "../operator_common.h"
 #include "../../executor/exec_pass.h"
 
-#if MXNET_USE_CUDA
+#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC
 
 namespace mxnet {
 
@@ -302,4 +302,4 @@ NNVM_REGISTER_OP(_FusedOpOutHelper)
 
 }  // namespace mxnet
 
-#endif  // MXNET_USE_CUDA
+#endif  // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC
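The fused_op sources above are wrapped whole in `#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC`, so an RTC-disabled build compiles them to empty translation units and never references NVRTC symbols, creating no extra linker dependency. A compact illustration of that whole-file guard pattern (the macros are hard-coded here only to make the snippet self-contained):

```cpp
// Whole-translation-unit guard, as applied to fused_op.cc / fused_op-inl.h.
#define MXNET_USE_CUDA 1
#define MXNET_ENABLE_CUDA_RTC 0  // simulate a build configured without RTC

#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC
// Everything inside the guard is discarded by the preprocessor, so the
// object file contains no fusion code.
void RegisterFusedOp();  // placeholder for the guarded registration code
#endif  // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC

int main() { return 0; }  // harness only; the real files define no main()
```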
diff --git a/src/operator/fusion/fused_op.cu b/src/operator/fusion/fused_op.cu
index 78988f13510e..62f340d0e00b 100644
--- a/src/operator/fusion/fused_op.cu
+++ b/src/operator/fusion/fused_op.cu
@@ -17,6 +17,9 @@
  * under the License.
  */
 
+// Additional use of MXNET_USE_CUDA is not needed to guard a '.cu' file.
+#if MXNET_ENABLE_CUDA_RTC
+
 #include
 #include
 #include
@@ -787,3 +790,5 @@ NNVM_REGISTER_OP(_FusedOp)
 .set_attr("FCompute", FusedOpForwardGPU);
 
 }  // namespace mxnet
+
+#endif  // MXNET_ENABLE_CUDA_RTC
diff --git a/src/operator/fusion/fused_op.h b/src/operator/fusion/fused_op.h
index 24603ac1932f..7d714677e941 100644
--- a/src/operator/fusion/fused_op.h
+++ b/src/operator/fusion/fused_op.h
@@ -20,7 +20,6 @@
 #ifndef MXNET_OPERATOR_FUSION_FUSED_OP_H_
 #define MXNET_OPERATOR_FUSION_FUSED_OP_H_
-
 #include
 #include
 #include
@@ -29,8 +28,7 @@
 #include
 #include
 
-#if MXNET_USE_CUDA
-
+#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC
 namespace mxnet {
 
@@ -202,5 +200,6 @@ using FusedOpHelperParamPtr = std::shared_ptr;
 
 }  // namespace mxnet
 
-#endif  // MXNET_USE_CUDA
+#endif  // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC
+
 #endif  // MXNET_OPERATOR_FUSION_FUSED_OP_H_
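For context on what the guarded code does when enabled: the fusion pass generates CUDA source at runtime and compiles it with NVRTC, which is why ENABLE_CUDA_RTC=1 links against nvrtc and cuda (see the CMake hunks above). A minimal generic NVRTC round-trip, not MXNet's actual codegen, assuming a CUDA toolkit is installed (compile with `-lnvrtc`); error handling is trimmed for brevity:

```cpp
// Compile a kernel given as a string to PTX with NVRTC.
#include <nvrtc.h>
#include <cstdio>

int main() {
  const char* src =
      "extern \"C\" __global__ void fill1(float* x) { x[threadIdx.x] = 1.0f; }";
  nvrtcProgram prog;
  nvrtcCreateProgram(&prog, src, "fused_kernel.cu", 0, nullptr, nullptr);
  nvrtcResult status = nvrtcCompileProgram(prog, 0, nullptr);
  if (status == NVRTC_SUCCESS) {
    size_t ptx_size = 0;
    nvrtcGetPTXSize(prog, &ptx_size);  // PTX could then be loaded via the CUDA driver API
    std::printf("compiled %zu bytes of PTX\n", ptx_size);
  }
  nvrtcDestroyProgram(&prog);
  return status == NVRTC_SUCCESS ? 0 : 1;
}
```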