Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

USE_NVRTC -> ENABLE_CUDA_RTC to fix maven build. Add compile-guard to fusion. #16838

Merged
merged 5 commits into from
Nov 20, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -633,14 +633,17 @@ if(USE_CUDA)
else()
list(APPEND CUDA_INCLUDE_DIRS ${INCLUDE_DIRECTORIES})
# define preprocessor macro so that we will not include the generated forcelink header
if(ENABLE_CUDA_RTC)
add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
endif()
# Create '.cmake' files for cuda compiles given definitions added thus far
mshadow_cuda_compile(cuda_objs ${CUDA})
if(MSVC)
if(ENABLE_CUDA_RTC)
FIND_LIBRARY(CUDA_nvrtc_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32")
list(APPEND mxnet_LINKER_LIBS ${CUDA_nvrtc_LIBRARY})
set(CUDA_cuda_LIBRARY "${CUDA_nvrtc_LIBRARY}/../cuda.lib")
list(APPEND mxnet_LINKER_LIBS ${CUDA_cuda_LIBRARY})
add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
endif()
FIND_LIBRARY(CUDA_cufft_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32")
list(APPEND mxnet_LINKER_LIBS "${CUDA_cufft_LIBRARY}/../cufft.lib") # For fft operator
Expand All @@ -652,7 +655,6 @@ if(USE_CUDA)
list(APPEND mxnet_LINKER_LIBS cufft cusolver)
if(ENABLE_CUDA_RTC)
list(APPEND mxnet_LINKER_LIBS nvrtc cuda)
add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
endif()
link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64")
endif()
Expand Down
2 changes: 1 addition & 1 deletion appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ before_build:

set OpenCV_DIR=%APPVEYOR_BUILD_FOLDER%/%MXNET_OPENCV_DIR%/build

cmake .. -DOPENCV_DIR=%OpenCV_DIR% -DUSE_PROFILER=1 -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -G "Visual Studio 12 2013 Win64"
cmake .. -DOPENCV_DIR=%OpenCV_DIR% -DUSE_PROFILER=1 -DUSE_CUDA=0 -DUSE_CUDNN=0 -DENABLE_CUDA_RTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -G "Visual Studio 12 2013 Win64"

build_script:
- cmd: >-
Expand Down
12 changes: 6 additions & 6 deletions ci/build_windows.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class BuildFlavour(Enum):
'WIN_CPU': (
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
'-DUSE_NVRTC=OFF '
'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
Expand All @@ -67,7 +67,7 @@ class BuildFlavour(Enum):
, 'WIN_CPU_MKLDNN': (
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
'-DUSE_NVRTC=OFF '
'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
Expand All @@ -80,7 +80,7 @@ class BuildFlavour(Enum):
, 'WIN_CPU_MKLDNN_MKL': (
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
'-DUSE_NVRTC=OFF '
'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=mkl '
Expand All @@ -93,7 +93,7 @@ class BuildFlavour(Enum):
, 'WIN_CPU_MKL': (
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
'-DUSE_NVRTC=OFF '
'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=mkl '
Expand All @@ -106,7 +106,7 @@ class BuildFlavour(Enum):
, 'WIN_GPU': (
'-DUSE_CUDA=ON '
'-DUSE_CUDNN=ON '
'-DUSE_NVRTC=ON '
'-DENABLE_CUDA_RTC=ON '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
Expand All @@ -122,7 +122,7 @@ class BuildFlavour(Enum):
, 'WIN_GPU_MKLDNN': (
'-DUSE_CUDA=ON '
'-DUSE_CUDNN=ON '
'-DUSE_NVRTC=ON '
'-DENABLE_CUDA_RTC=ON '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
Expand Down
2 changes: 1 addition & 1 deletion make/maven/maven_darwin_mkl.mk
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ USE_CUDNN = 0
# CUDA_ARCH :=

# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
USE_NVRTC = 0
ENABLE_CUDA_RTC = 0

# use openmp for parallelization
USE_OPENMP = 0
Expand Down
2 changes: 1 addition & 1 deletion make/maven/maven_linux_cu90mkl.mk
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ USE_NCCL = 1

# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
USE_NVTX=1
USE_NVRTC = 1
ENABLE_CUDA_RTC = 1

# use openmp for parallelization
USE_OPENMP = 1
Expand Down
2 changes: 1 addition & 1 deletion make/maven/maven_linux_cu92mkl.mk
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ USE_NCCL = 1

# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
USE_NVTX=1
USE_NVRTC = 1
ENABLE_CUDA_RTC = 1

# use openmp for parallelization
USE_OPENMP = 1
Expand Down
2 changes: 1 addition & 1 deletion make/maven/maven_linux_mkl.mk
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ USE_CUDNN = 0
# CUDA_ARCH :=

# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
USE_NVRTC = 0
ENABLE_CUDA_RTC = 0

# use openmp for parallelization
USE_OPENMP = 1
Expand Down
5 changes: 5 additions & 0 deletions src/executor/exec_pass.h
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,11 @@ Graph FusePointwiseForward(Graph&& g);
*/
Graph FusePointwiseBackward(Graph&& g);

/*!
* \brief Issue a one-time warning that fusion is not possible for this platform or build.
*/
void WarnFusionNotSupported();

/*!
* \brief Infer shapes in the graph given the information.
* \param graph The input graph.
Expand Down
9 changes: 7 additions & 2 deletions src/executor/graph_executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -999,7 +999,7 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol,
// setup gradient
nnvm::Graph g = InitFullGraph(symbol, grad_req_types);

#if MXNET_USE_CUDA && !defined(_WIN32)
#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32)
if (default_ctx.dev_mask() == Context::kGPU && dmlc::GetEnv("MXNET_USE_FUSION", true)) {
nnvm::Graph unoptimized_graph;
common::CopyGraph(&unoptimized_graph, g, false);
Expand Down Expand Up @@ -1032,7 +1032,12 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol,
<< "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!";
}
}
#endif // MXNET_USE_CUDA
#else
// Only warn user if MXNET_USE_FUSION env var is explicitly set
if (default_ctx.dev_mask() == Context::kGPU && dmlc::GetEnv("MXNET_USE_FUSION", false)) {
WarnFusionNotSupported();
}
#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32)

// create "device" and "context" attrs for the graph
g = AssignContext(g, default_ctx, ctx_map,
Expand Down
22 changes: 19 additions & 3 deletions src/executor/pointwise_fusion_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,26 @@
#include "../operator/fusion/fused_op.h"
#include "../operator/operator_common.h"

#if MXNET_USE_CUDA

namespace mxnet {
namespace exec {

// Logs a one-time warning that dynamic pointwise-op fusion is unavailable in
// this build or on this platform. Declared in src/executor/exec_pass.h and
// invoked from graph_executor.cc / cached_op.cc when the user explicitly sets
// MXNET_USE_FUSION but fusion support was compiled out.
void WarnFusionNotSupported() {
// Ensures the warning is emitted at most once per process.
// NOTE(review): the check-then-set below is not synchronized, so concurrent
// first calls could each log — harmless for a warning; confirm if stricter
// once-semantics are ever required.
static bool issued_warning = false;
if (!issued_warning) {
issued_warning = true;
#if defined(_WIN32)
// On Windows, fusion is disabled unconditionally (see the !defined(_WIN32)
// guards at the fusion call sites), so the message does not mention RTC.
LOG(WARNING) << "Omitting dynamic fused op creation- not enabled on Windows. "
<< "Unset env var MXNET_USE_FUSION=1 to quiet this message.";
#else
// On other platforms, fusion additionally requires the library to be built
// with both USE_CUDA=1 and ENABLE_CUDA_RTC=1.
LOG(WARNING) << "Omitting dynamic fused op creation- needs MXNet lib built with "
<< "USE_CUDA=1 and ENABLE_CUDA_RTC=1. Unset env var MXNET_USE_FUSION=1 "
<< "to quiet this message.";
#endif // defined(_WIN32)
}
}

#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC

namespace {
bool IsFusionCompatible(nnvm::Node* n) {
using namespace mxnet::fusion;
Expand Down Expand Up @@ -304,8 +320,8 @@ Graph FusePointwiseBackward(Graph &&g) {
ret.outputs = g.outputs;
return ret;
}
#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC

} // namespace exec
} // namespace mxnet

#endif // MXNET_USE_CUDA
13 changes: 8 additions & 5 deletions src/imperative/cached_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -167,10 +167,8 @@ void SetRefCounts(nnvm::Graph* fwd_graph, const nnvm::Graph& full_graph) {

void OptimizeGraph(nnvm::Graph * full_graph, nnvm::Graph * fwd_graph, nnvm::Graph * grad_graph,
const Context& context, size_t num_forward_outputs, const bool inlining) {
#if MXNET_USE_CUDA && !defined(_WIN32)
if (context.dev_mask() == kGPU &&
!inlining &&
dmlc::GetEnv("MXNET_USE_FUSION", true)) {
#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32)
if (context.dev_mask() == kGPU && !inlining && dmlc::GetEnv("MXNET_USE_FUSION", true)) {
nnvm::Graph unoptimized_graph;
common::CopyGraph(&unoptimized_graph, *full_graph, false);

Expand Down Expand Up @@ -202,7 +200,12 @@ void OptimizeGraph(nnvm::Graph * full_graph, nnvm::Graph * fwd_graph, nnvm::Grap
<< "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!";
}
}
#endif // MXNET_USE_CUDA
#else
// Only warn user if MXNET_USE_FUSION env var is explicitly set
if (context.dev_mask() == kGPU && !inlining && dmlc::GetEnv("MXNET_USE_FUSION", false)) {
exec::WarnFusionNotSupported();
}
#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC && !defined(_WIN32)

*fwd_graph = nnvm::Graph();
fwd_graph->outputs = std::vector<nnvm::NodeEntry>(full_graph->outputs.begin(),
Expand Down
4 changes: 2 additions & 2 deletions src/operator/fusion/fused_op-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#include <map>
#include <vector>

#if MXNET_USE_CUDA
#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC

namespace mxnet {

Expand Down Expand Up @@ -992,6 +992,6 @@ const char kernel_end[] = R"code(}

} // namespace mxnet

#endif // MXNET_USE_CUDA
#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC

#endif // MXNET_OPERATOR_FUSION_FUSED_OP_INL_H_
4 changes: 2 additions & 2 deletions src/operator/fusion/fused_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#include "../operator_common.h"
#include "../../executor/exec_pass.h"

#if MXNET_USE_CUDA
#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC

namespace mxnet {

Expand Down Expand Up @@ -302,4 +302,4 @@ NNVM_REGISTER_OP(_FusedOpOutHelper)

} // namespace mxnet

#endif // MXNET_USE_CUDA
#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC
5 changes: 5 additions & 0 deletions src/operator/fusion/fused_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
* under the License.
*/

// Additional use of MXNET_USE_CUDA is not needed to guard a '.cu' file.
#if MXNET_ENABLE_CUDA_RTC

#include <sys/stat.h>
#include <nvrtc.h>
#include <cuda.h>
Expand Down Expand Up @@ -787,3 +790,5 @@ NNVM_REGISTER_OP(_FusedOp)
.set_attr<FCompute>("FCompute<gpu>", FusedOpForwardGPU);

} // namespace mxnet

#endif // MXNET_ENABLE_CUDA_RTC
7 changes: 3 additions & 4 deletions src/operator/fusion/fused_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
#ifndef MXNET_OPERATOR_FUSION_FUSED_OP_H_
#define MXNET_OPERATOR_FUSION_FUSED_OP_H_


#include <mxnet/operator.h>
#include <nnvm/graph.h>
#include <vector>
Expand All @@ -29,8 +28,7 @@
#include <mutex>
#include <tuple>

#if MXNET_USE_CUDA

#if MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC

namespace mxnet {

Expand Down Expand Up @@ -202,5 +200,6 @@ using FusedOpHelperParamPtr = std::shared_ptr<FusedOpHelperParam>;

} // namespace mxnet

#endif // MXNET_USE_CUDA
#endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC

#endif // MXNET_OPERATOR_FUSION_FUSED_OP_H_