diff --git a/graphbolt/src/cuda/unique_and_compact_impl.cu b/graphbolt/src/cuda/unique_and_compact_impl.cu
index 855bb2f67325..71ecfe3f553f 100644
--- a/graphbolt/src/cuda/unique_and_compact_impl.cu
+++ b/graphbolt/src/cuda/unique_and_compact_impl.cu
@@ -277,22 +277,7 @@ UniqueAndCompactBatched(
     const std::vector<torch::Tensor>& src_ids,
     const std::vector<torch::Tensor>& dst_ids,
     const std::vector<torch::Tensor>& unique_dst_ids, int num_bits) {
-  auto dev_id = cuda::GetCurrentStream().device_index();
-  static std::mutex mtx;
-  static std::unordered_map<int, int> compute_capability_cache;
-  const auto compute_capability_major = [&] {
-    std::lock_guard lock(mtx);
-    auto it = compute_capability_cache.find(dev_id);
-    if (it != compute_capability_cache.end()) {
-      return it->second;
-    } else {
-      int major;
-      CUDA_RUNTIME_CHECK(cudaDeviceGetAttribute(
-          &major, cudaDevAttrComputeCapabilityMajor, dev_id));
-      return compute_capability_cache[dev_id] = major;
-    }
-  }();
-  if (compute_capability_major >= 7) {
+  if (cuda::compute_capability() >= 70) {
     // Utilizes a hash table based implementation, the mapped id of a vertex
     // will be monotonically increasing as the first occurrence index of it in
     // torch.cat([unique_dst_ids, src_ids]). Thus, it is deterministic.
diff --git a/graphbolt/src/cuda/utils.h b/graphbolt/src/cuda/utils.h
index a49a9c1880c9..05f5ffbb2c8c 100644
--- a/graphbolt/src/cuda/utils.h
+++ b/graphbolt/src/cuda/utils.h
@@ -16,6 +16,16 @@ constexpr int CUDA_MAX_NUM_THREADS = 1024;
 namespace graphbolt {
 namespace cuda {
 
+/**
+ * @brief Returns the compute capability of the cuda device, e.g. 70 for Volta.
+ */
+inline int compute_capability(
+    int device = cuda::GetCurrentStream().device_index()) {
+  int sm_version;
+  CUDA_RUNTIME_CHECK(cub::SmVersion(sm_version, device));
+  return sm_version / 10;
+};
+
 /**
  * @brief Calculate the number of threads needed given the size of the dimension
  * to be processed.
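
Note (illustration only, not part of the patch): the sketch below shows what the new cuda::compute_capability() helper reduces to as a standalone program. It calls cub::SmVersion() directly; recent CUB versions memoize device attributes per device internally, which is presumably why the hand-rolled mutex + unordered_map cache removed from UniqueAndCompactBatched() is no longer needed. Everything outside the two calls visible in the diff (main(), the file name, the prints, the error handling) is invented for the example.

// sketch.cu -- a minimal sketch; build with: nvcc -std=c++17 sketch.cu -o sketch
#include <cub/util_device.cuh>  // cub::SmVersion
#include <cuda_runtime.h>

#include <cstdio>

int main() {
  int device = 0;
  if (cudaGetDevice(&device) != cudaSuccess) return 1;

  // cub::SmVersion reports major * 100 + minor * 10, e.g. 700 for SM 7.0.
  int sm_version = 0;
  if (cub::SmVersion(sm_version, device) != cudaSuccess) return 1;

  // Same scaling as the helper added to utils.h: 700 -> 70 (Volta), 800 -> 80 (Ampere).
  const int compute_capability = sm_version / 10;
  std::printf("compute_capability(%d) = %d\n", device, compute_capability);

  // Mirrors the updated call site in UniqueAndCompactBatched(): the hash table
  // based implementation is only selected on Volta (SM 70) or newer.
  if (compute_capability >= 70) {
    std::printf("hash table based implementation would be selected\n");
  }
  return 0;
}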