dmlc · mfbalin · Jun 27, 2024 · Jun 26, 2024 · Jun 26, 2024
diff --git a/graphbolt/src/cuda/unique_and_compact_impl.cu b/graphbolt/src/cuda/unique_and_compact_impl.cu
@@ -277,22 +277,7 @@ UniqueAndCompactBatched(
     const std::vector<torch::Tensor>& src_ids,
     const std::vector<torch::Tensor>& dst_ids,
     const std::vector<torch::Tensor>& unique_dst_ids, int num_bits) {
-  auto dev_id = cuda::GetCurrentStream().device_index();
-  static std::mutex mtx;
-  static std::unordered_map<decltype(dev_id), int> compute_capability_cache;
-  const auto compute_capability_major = [&] {
-    std::lock_guard lock(mtx);
-    auto it = compute_capability_cache.find(dev_id);
-    if (it != compute_capability_cache.end()) {
-      return it->second;
-    } else {
-      int major;
-      CUDA_RUNTIME_CHECK(cudaDeviceGetAttribute(
-          &major, cudaDevAttrComputeCapabilityMajor, dev_id));
-      return compute_capability_cache[dev_id] = major;
-    }
-  }();
-  if (compute_capability_major >= 7) {
+  if (cuda::compute_capability() >= 70) {
     // Utilizes a hash table based implementation, the mapped id of a vertex
     // will be monotonically increasing as the first occurrence index of it in
     // torch.cat([unique_dst_ids, src_ids]). Thus, it is deterministic.

diff --git a/graphbolt/src/cuda/utils.h b/graphbolt/src/cuda/utils.h
@@ -16,6 +16,16 @@ constexpr int CUDA_MAX_NUM_THREADS = 1024;
 namespace graphbolt {
 namespace cuda {
 
+/** @brief Returns the compute capability of a CUDA device, e.g. 70 for Volta.
+ * @param device Device ordinal; defaults to the current stream's device.
+ * @return major * 10 + minor, derived from cub::SmVersion. */
+inline int compute_capability(
+    int device = cuda::GetCurrentStream().device_index()) {
+  int sm_version;
+  CUDA_RUNTIME_CHECK(cub::SmVersion(sm_version, device));
+  return sm_version / 10;  // cub encodes it as major * 100 + minor * 10.
+}
+
 /**
  * @brief Calculate the number of threads needed given the size of the dimension
  * to be processed.