apache · KellenSunderland · Jul 12, 2019 · Jul 2, 2019 · Jul 3, 2019 · Jul 4, 2019
@@ -242,6 +242,16 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`.
 	- If set to '0', disallows implicit type conversions to Float16 to use Tensor Cores
 	- If set to '1', allows CUDA ops like RNN and Convolution to use TensorCores even with Float32 input data by using implicit type casting to Float16. Only has an effect if `MXNET_CUDA_ALLOW_TENSOR_CORE` is `1`.
 
+* MXNET_CUDA_VERSION_CHECKING
+  - 0(false) or 1(true) ```(default=1)```
+  - If set to '0', disables the runtime checks of the CUDA library version and their associated warning messages.
+  - If set to '1', enables these checks (e.g. compile vs. link version mismatch, or a version older than the oldest one tested by CI).
+
+* MXNET_CUDNN_VERSION_CHECKING
+  - 0(false) or 1(true) ```(default=1)```
+  - If set to '0', disables the runtime checks of the cuDNN library version and their associated warning messages.
+  - If set to '1', enables these checks (e.g. compile vs. link version mismatch, or a version older than the oldest one tested by CI).
+
 * MXNET_GLUON_REPO
   - Values: String ```(default='https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/'```
   - The repository url to be used for Gluon datasets and pre-trained models.

diff --git a/src/common/cuda_utils.cc b/src/common/cuda_utils.cc
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file cuda_utils.cc
+ * \brief CUDA debugging utilities.
+ */
+
+#include <mxnet/base.h>
+#include "cuda_utils.h"
+
+#if MXNET_USE_CUDA == 1
+
+namespace mxnet {
+namespace common {
+namespace cuda {
+
+// The oldest version of cuda used in upstream MXNet CI testing, both for unix and windows.
+// Users that have rebuilt MXNet against older versions will be advised with a warning to upgrade
+// their systems to match the CI level.  Minimally, users should rerun the CI locally.
+// MSVC builds (detected via _MSC_VER) use a different minimum than all other builds.
+#if defined(_MSC_VER)
+#define MXNET_CI_OLDEST_CUDA_VERSION  9020
+#else
+#define MXNET_CI_OLDEST_CUDA_VERSION 10000
+#endif
+
+
+// Start-up check that the version of cuda compiled-against matches the linked-against version.
+// Also warns if the compiled-against version is older than the oldest version tested by CI.
+// Both checks are skipped when MXNET_CUDA_VERSION_CHECKING is set to 0 or no GPU is visible.
+// Always returns true, so the result can be used to initialize a namespace-scope variable.
+bool CudaVersionChecks() {
+  // Don't bother with checks if there are no GPUs visible (e.g. with CUDA_VISIBLE_DEVICES="")
+  if (dmlc::GetEnv("MXNET_CUDA_VERSION_CHECKING", true) && Context::GetGPUCount() > 0) {
+    int linkedAgainstCudaVersion = 0;
+    CUDA_CALL(cudaRuntimeGetVersion(&linkedAgainstCudaVersion));
+    if (linkedAgainstCudaVersion != CUDA_VERSION) {
+      // ".  " keeps the quieting hint separated from the version number in the log line.
+      LOG(WARNING) << "cuda library mismatch: linked-against version " << linkedAgainstCudaVersion
+                   << " != compiled-against version " << CUDA_VERSION << ".  "
+                   << "Set MXNET_CUDA_VERSION_CHECKING=0 to quiet this warning.";
+    }
+    if (CUDA_VERSION < MXNET_CI_OLDEST_CUDA_VERSION) {
+      LOG(WARNING) << "Upgrade advisory: this mxnet has been built against cuda library version "
+                   << CUDA_VERSION << ", which is older than the oldest version tested by CI ("
+                   << MXNET_CI_OLDEST_CUDA_VERSION << ").  "
+                   << "Set MXNET_CUDA_VERSION_CHECKING=0 to quiet this warning.";
+    }
+  }
+  return true;
+}
+
+// Dynamic initialization here will emit a warning if runtime and compile-time versions mismatch,
+// and also if the user has recompiled their source against a version no longer tested by
+// upstream CI.  CudaVersionChecks() always returns true, so the stored value carries no
+// information of its own.
+bool cuda_version_ok = CudaVersionChecks();
+
+}  // namespace cuda
+}  // namespace common
+}  // namespace mxnet
+
+#endif  // MXNET_USE_CUDA
+
+#if MXNET_USE_CUDNN == 1
+
+namespace mxnet {
+namespace common {
+namespace cudnn {
+
+// The oldest version of CUDNN used in upstream MXNet CI testing, both for unix and windows.
+// Users that have rebuilt MXNet against older versions will be advised with a warning to upgrade
+// their systems to match the CI level.  Minimally, users should rerun the CI locally.
+// Both branches currently hold the same value; the MSVC split mirrors the CUDA definition.
+#if defined(_MSC_VER)
+#define MXNET_CI_OLDEST_CUDNN_VERSION 7600
+#else
+#define MXNET_CI_OLDEST_CUDNN_VERSION 7600
+#endif
+
+// Start-up comparison of the cuDNN version MXNet was compiled against with the version it is
+// linked against.  Additionally advises an upgrade when the compiled-against version is older
+// than the oldest one exercised by upstream CI.  Always returns true.
+bool CuDNNVersionChecks() {
+  // Honor the user's opt-out first; the GPU count is only queried when checking is enabled.
+  const bool checking_enabled = dmlc::GetEnv("MXNET_CUDNN_VERSION_CHECKING", true);
+  // Nothing to check if there are no GPUs visible (e.g. with CUDA_VISIBLE_DEVICES="")
+  if (!checking_enabled || Context::GetGPUCount() <= 0) {
+    return true;
+  }
+
+  const size_t runtime_version = cudnnGetVersion();
+  if (runtime_version != CUDNN_VERSION) {
+    LOG(WARNING) << "cuDNN library mismatch: linked-against version " << runtime_version
+                 << " != compiled-against version " << CUDNN_VERSION << ".  "
+                 << "Set MXNET_CUDNN_VERSION_CHECKING=0 to quiet this warning.";
+  }
+  if (CUDNN_VERSION < MXNET_CI_OLDEST_CUDNN_VERSION) {
+    LOG(WARNING) << "Upgrade advisory: this mxnet has been built against cuDNN library version "
+                 << CUDNN_VERSION << ", which is older than the oldest version tested by CI ("
+                 << MXNET_CI_OLDEST_CUDNN_VERSION << ").  "
+                 << "Set MXNET_CUDNN_VERSION_CHECKING=0 to quiet this warning.";
+  }
+  return true;
+}
+
+// Dynamic initialization here will emit a warning if runtime and compile-time versions mismatch,
+// and also if the user has recompiled their source against a version no longer tested by
+// upstream CI.  CuDNNVersionChecks() always returns true, so the stored value carries no
+// information of its own.
+bool cudnn_version_ok = CuDNNVersionChecks();
+
+}  // namespace cudnn
+}  // namespace common
+}  // namespace mxnet
+
+#endif  // MXNET_USE_CUDNN
diff --git a/src/common/cuda_utils.h b/src/common/cuda_utils.h
@@ -47,12 +47,20 @@ extern __cuda_fake_struct threadIdx;
 extern __cuda_fake_struct blockIdx;
 #endif
 
+// Two-level stringification helpers: QUOTE(x) turns its argument into a string literal exactly
+// as written, while QUOTEVALUE(x) macro-expands x first and then stringifies the result.
+#define QUOTE(x) #x
+#define QUOTEVALUE(x) QUOTE(x)
+
 #if MXNET_USE_CUDA
 
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <curand.h>
 
+/*!
+ * \brief Compile-time check that the compiled-against CUDA version is at least min_version.
+ *        min_version is both compared against CUDA_VERSION and stringified into the
+ *        error message, so pass a plain integer literal or a macro expanding to one.
+ */
+#define STATIC_ASSERT_CUDA_VERSION_GE(min_version) \
+  static_assert(CUDA_VERSION >= (min_version), "Compiled-against CUDA version " \
+      QUOTEVALUE(CUDA_VERSION) " is too old, please upgrade system to version " \
+      QUOTEVALUE(min_version) " or later.")
+
 /*!
  * \brief When compiling a __device__ function, check that the architecture is >= Kepler (3.0)
  *        Note that __CUDA_ARCH__ is not defined outside of a __device__ function
@@ -441,6 +449,25 @@ inline cublasMath_t SetCublasMathMode(cublasHandle_t blas_handle, cublasMath_t n
 
 #include <cudnn.h>
 
+// Creating CUDNN_VERSION_AS_STRING as follows avoids a static_assert error message that shows
+// the formula for CUDNN_VERSION, i.e. "1000 * 7 + 100 * 6 + 0" rather than number "7600".
+// The string is the concatenation of the stringified major, minor and patch-level macros, which
+// only matches the numeric form for a single-digit minor and a patch level below 100.
+static_assert(CUDNN_PATCHLEVEL < 100 && CUDNN_MINOR < 10,
+              "CUDNN_VERSION_AS_STRING macro assumptions violated.");
+#if CUDNN_PATCHLEVEL >= 10
+// Two-digit patch level: concatenate the three fields as-is.
+#define CUDNN_VERSION_AS_STRING QUOTEVALUE(CUDNN_MAJOR) \
+                                QUOTEVALUE(CUDNN_MINOR) \
+                                QUOTEVALUE(CUDNN_PATCHLEVEL)
+#else
+// One-digit patch level: pad with a leading "0" so the patch field is always two digits wide.
+#define CUDNN_VERSION_AS_STRING QUOTEVALUE(CUDNN_MAJOR) \
+                                QUOTEVALUE(CUDNN_MINOR) \
+                                "0" QUOTEVALUE(CUDNN_PATCHLEVEL)
+#endif
+
+/*!
+ * \brief Compile-time check that the compiled-against cuDNN version is at least min_version.
+ *        min_version is both compared against CUDNN_VERSION and stringified into the
+ *        error message, so pass a plain integer literal or a macro expanding to one.
+ */
+#define STATIC_ASSERT_CUDNN_VERSION_GE(min_version) \
+  static_assert(CUDNN_VERSION >= (min_version), "Compiled-against cuDNN version " \
+      CUDNN_VERSION_AS_STRING " is too old, please upgrade system to version " \
+      QUOTEVALUE(min_version) " or later.")
+
 #define CUDNN_CALL(func)                                                      \
   {                                                                           \
     cudnnStatus_t e = (func);                                                 \

diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc
@@ -172,6 +172,7 @@ static std::vector<ResourceRequest> RNNResourceEx(const NodeAttrs& attrs, const
   std::vector<ResourceRequest> request;
   if (dev_mask == kGPU) {
 #if MXNET_USE_CUDNN_RNN
+    STATIC_ASSERT_CUDNN_VERSION_GE(7000);
     request.emplace_back(ResourceRequest::kTempSpace);
 
     const RNNParam& param = nnvm::get<RNNParam>(attrs.parsed);