From f3697ee5b0267f5c7fb8b39715c77ae319bd8aee Mon Sep 17 00:00:00 2001
From: nv-dlasalle <63612878+nv-dlasalle@users.noreply.github.com>
Date: Wed, 17 May 2023 16:18:35 -0700
Subject: [PATCH] [Performance Improvement] Make GPU sampling and to_block use
 pinned memory to decrease required synchronization (#5685)

---
 src/array/cuda/rowwise_sampling.cu        | 27 ++++++++++-----
 src/graph/transform/cuda/cuda_to_block.cu | 42 ++++++++++++++++++-----
 2 files changed, 52 insertions(+), 17 deletions(-)

diff --git a/src/array/cuda/rowwise_sampling.cu b/src/array/cuda/rowwise_sampling.cu
index 329685b8e814..d04bc3673d30 100644
--- a/src/array/cuda/rowwise_sampling.cu
+++ b/src/array/cuda/rowwise_sampling.cu
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include <dgl/runtime/tensordispatch.h>
 
 #include
 
@@ -15,9 +16,11 @@
 #include "./dgl_cub.cuh"
 #include "./utils.h"
 
+using namespace dgl::cuda;
+using namespace dgl::aten::cuda;
+using TensorDispatcher = dgl::runtime::TensorDispatcher;
+
 namespace dgl {
-using namespace cuda;
-using namespace aten::cuda;
 namespace aten {
 namespace impl {
 
@@ -287,13 +290,20 @@ COOMatrix _CSRRowWiseSamplingUniform(
   cudaEvent_t copyEvent;
   CUDA_CALL(cudaEventCreate(&copyEvent));
 
-  // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and
-  // wait on a cudaevent
-  IdType new_len;
+  NDArray new_len_tensor;
+  if (TensorDispatcher::Global()->IsAvailable()) {
+    new_len_tensor = NDArray::PinnedEmpty(
+        {1}, DGLDataTypeTraits<IdType>::dtype, DGLContext{kDGLCPU, 0});
+  } else {
+    // use pageable memory, it will unnecessarily block but be functional
+    new_len_tensor = NDArray::Empty(
+        {1}, DGLDataTypeTraits<IdType>::dtype, DGLContext{kDGLCPU, 0});
+  }
+
   // copy using the internal current stream
-  device->CopyDataFromTo(
-      out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx,
-      DGLContext{kDGLCPU, 0}, mat.indptr->dtype);
+  CUDA_CALL(cudaMemcpyAsync(
+      new_len_tensor->data, out_ptr + num_rows, sizeof(IdType),
+      cudaMemcpyDeviceToHost, stream));
   CUDA_CALL(cudaEventRecord(copyEvent, stream));
 
   const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
@@ -322,6 +332,7 @@ COOMatrix _CSRRowWiseSamplingUniform(
   CUDA_CALL(cudaEventSynchronize(copyEvent));
   CUDA_CALL(cudaEventDestroy(copyEvent));
 
+  const IdType new_len = static_cast<const IdType*>(new_len_tensor->data)[0];
   picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
   picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
   picked_idx = picked_idx.CreateView({new_len}, picked_idx->dtype);
diff --git a/src/graph/transform/cuda/cuda_to_block.cu b/src/graph/transform/cuda/cuda_to_block.cu
index f81c8713b910..a8bffc8cc6e8 100644
--- a/src/graph/transform/cuda/cuda_to_block.cu
+++ b/src/graph/transform/cuda/cuda_to_block.cu
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include <dgl/runtime/tensordispatch.h>
 
 #include
 #include
@@ -36,6 +37,7 @@
 using namespace dgl::aten;
 using namespace dgl::runtime::cuda;
 using namespace dgl::transform::cuda;
+using TensorDispatcher = dgl::runtime::TensorDispatcher;
 
 namespace dgl {
 namespace transform {
@@ -165,6 +167,9 @@ struct CUDAIdsMapper {
             NewIdArray(maxNodesPerType[ntype], ctx, sizeof(IdType) * 8));
       }
     }
+
+    cudaEvent_t copyEvent;
+    NDArray new_len_tensor;
     // Populate the mappings.
     if (generate_lhs_nodes) {
       int64_t* count_lhs_device = static_cast<int64_t*>(
@@ -174,13 +179,23 @@ struct CUDAIdsMapper {
           src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes,
           stream);
 
-      device->CopyDataFromTo(
-          count_lhs_device, 0, num_nodes_per_type.data(), 0,
-          sizeof(*num_nodes_per_type.data()) * num_ntypes, ctx,
-          DGLContext{kDGLCPU, 0}, DGLDataType{kDGLInt, 64, 1});
-      device->StreamSync(ctx, stream);
+      CUDA_CALL(cudaEventCreate(&copyEvent));
+      if (TensorDispatcher::Global()->IsAvailable()) {
+        new_len_tensor = NDArray::PinnedEmpty(
+            {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
+            DGLContext{kDGLCPU, 0});
+      } else {
+        // use pageable memory, it will unnecessarily block but be functional
+        new_len_tensor = NDArray::Empty(
+            {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
+            DGLContext{kDGLCPU, 0});
+      }
+      CUDA_CALL(cudaMemcpyAsync(
+          new_len_tensor->data, count_lhs_device,
+          sizeof(*num_nodes_per_type.data()) * num_ntypes,
+          cudaMemcpyDeviceToHost, stream));
+      CUDA_CALL(cudaEventRecord(copyEvent, stream));
 
-      // Wait for the node counts to finish transferring.
       device->FreeWorkspace(ctx, count_lhs_device);
     } else {
       maker.Make(lhs_nodes, rhs_nodes, &node_maps, stream);
@@ -189,14 +204,23 @@ struct CUDAIdsMapper {
         num_nodes_per_type[ntype] = lhs_nodes[ntype]->shape[0];
       }
     }
 
-    // Resize lhs nodes.
+    // Map node numberings from global to local, and build pointer for CSR.
+    auto ret = MapEdges(graph, edge_arrays, node_maps, stream);
+
     if (generate_lhs_nodes) {
+      // wait for the previous copy
+      CUDA_CALL(cudaEventSynchronize(copyEvent));
+      CUDA_CALL(cudaEventDestroy(copyEvent));
+
+      // Resize lhs nodes.
       for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
+        num_nodes_per_type[ntype] =
+            static_cast<int64_t*>(new_len_tensor->data)[ntype];
         lhs_nodes[ntype]->shape[0] = num_nodes_per_type[ntype];
       }
     }
-    // Map node numberings from global to local, and build pointer for CSR.
-    return MapEdges(graph, edge_arrays, node_maps, stream);
+
+    return ret;
   }
 };
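
Reviewer note, not part of the patch: both files now follow the same CUDA readback pattern. The host destination is allocated in pinned (page-locked) memory when the TensorDispatcher is available, the device-to-host copy is issued with cudaMemcpyAsync on the worker stream, a cudaEvent is recorded right after the copy, further GPU work keeps getting queued, and cudaEventSynchronize is called only at the point where the host actually needs the value. With pageable host memory the "async" copy blocks the calling thread, which is what the fallback branch's comment warns about. The standalone sketch below illustrates that pattern outside of DGL; the kernel names (ComputeLength, MoreWork), the buffer sizes, and the CHECK_CUDA macro are invented for the example, and plain cudaMallocHost stands in for NDArray::PinnedEmpty.

#include <cstdio>
#include <cuda_runtime.h>

#define CHECK_CUDA(call)                                                \
  do {                                                                  \
    cudaError_t err = (call);                                           \
    if (err != cudaSuccess) {                                           \
      fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));     \
      return 1;                                                         \
    }                                                                   \
  } while (0)

// Stand-in for the sampling kernel: writes the "result length" to d_len.
__global__ void ComputeLength(int64_t* d_len) { *d_len = 42; }

// Stand-in for the remaining GPU work that can overlap with the D2H copy.
__global__ void MoreWork(float* d_buf, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) d_buf[i] = i * 2.0f;
}

int main() {
  cudaStream_t stream;
  CHECK_CUDA(cudaStreamCreate(&stream));

  int64_t* d_len = nullptr;
  float* d_buf = nullptr;
  CHECK_CUDA(cudaMalloc(&d_len, sizeof(int64_t)));
  CHECK_CUDA(cudaMalloc(&d_buf, 1024 * sizeof(float)));

  // Pinned (page-locked) host memory: cudaMemcpyAsync into it can overlap
  // with GPU work and does not block the host thread. With pageable memory
  // the same call would effectively synchronize.
  int64_t* h_len = nullptr;
  CHECK_CUDA(cudaMallocHost(&h_len, sizeof(int64_t)));

  cudaEvent_t copyEvent;
  CHECK_CUDA(cudaEventCreate(&copyEvent));

  ComputeLength<<<1, 1, 0, stream>>>(d_len);

  // Queue the D2H copy and mark its completion with an event, then keep
  // launching GPU work on the same stream instead of synchronizing now.
  CHECK_CUDA(cudaMemcpyAsync(
      h_len, d_len, sizeof(int64_t), cudaMemcpyDeviceToHost, stream));
  CHECK_CUDA(cudaEventRecord(copyEvent, stream));

  MoreWork<<<4, 256, 0, stream>>>(d_buf, 1024);

  // Wait only on the copy event when the host actually needs the value;
  // MoreWork, enqueued after the event, may still be running.
  CHECK_CUDA(cudaEventSynchronize(copyEvent));
  CHECK_CUDA(cudaEventDestroy(copyEvent));
  printf("length copied back: %lld\n", (long long)(*h_len));

  CHECK_CUDA(cudaStreamSynchronize(stream));
  CHECK_CUDA(cudaFreeHost(h_len));
  CHECK_CUDA(cudaFree(d_len));
  CHECK_CUDA(cudaFree(d_buf));
  CHECK_CUDA(cudaStreamDestroy(stream));
  return 0;
}

Build with nvcc (e.g. nvcc pinned_copy_sketch.cu -o pinned_copy_sketch, a hypothetical file name). The point to notice is that cudaEventSynchronize waits only for work recorded before the event, so the host gets its value back without draining the rest of the stream, which is exactly the synchronization the patch removes from the sampling and to_block paths.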