From f3697ee5b0267f5c7fb8b39715c77ae319bd8aee Mon Sep 17 00:00:00 2001
From: nv-dlasalle <63612878+nv-dlasalle@users.noreply.github.com>
Date: Wed, 17 May 2023 16:18:35 -0700
Subject: [PATCH] [Performance Improvement] Make GPU sampling and to_block use
 pinned memory to decrease required synchronization (#5685)

---
 src/array/cuda/rowwise_sampling.cu        | 27 ++++++++++-----
 src/graph/transform/cuda/cuda_to_block.cu | 42 ++++++++++++++++++-----
 2 files changed, 52 insertions(+), 17 deletions(-)

diff --git a/src/array/cuda/rowwise_sampling.cu b/src/array/cuda/rowwise_sampling.cu
index 329685b8e814..d04bc3673d30 100644
--- a/src/array/cuda/rowwise_sampling.cu
+++ b/src/array/cuda/rowwise_sampling.cu
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include <dgl/runtime/tensordispatch.h>
 
 #include
 
@@ -15,9 +16,11 @@
 #include "./dgl_cub.cuh"
 #include "./utils.h"
 
+using namespace dgl::cuda;
+using namespace dgl::aten::cuda;
+using TensorDispatcher = dgl::runtime::TensorDispatcher;
+
 namespace dgl {
-using namespace cuda;
-using namespace aten::cuda;
 namespace aten {
 namespace impl {
 
@@ -287,13 +290,20 @@ COOMatrix _CSRRowWiseSamplingUniform(
   cudaEvent_t copyEvent;
   CUDA_CALL(cudaEventCreate(&copyEvent));
 
-  // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and
-  // wait on a cudaevent
-  IdType new_len;
+  NDArray new_len_tensor;
+  if (TensorDispatcher::Global()->IsAvailable()) {
+    new_len_tensor = NDArray::PinnedEmpty(
+        {1}, DGLDataTypeTraits<IdType>::dtype, DGLContext{kDGLCPU, 0});
+  } else {
+    // use pageable memory, it will unnecessarily block but be functional
+    new_len_tensor = NDArray::Empty(
+        {1}, DGLDataTypeTraits<IdType>::dtype, DGLContext{kDGLCPU, 0});
+  }
+
   // copy using the internal current stream
-  device->CopyDataFromTo(
-      out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx,
-      DGLContext{kDGLCPU, 0}, mat.indptr->dtype);
+  CUDA_CALL(cudaMemcpyAsync(
+      new_len_tensor->data, out_ptr + num_rows, sizeof(IdType),
+      cudaMemcpyDeviceToHost, stream));
   CUDA_CALL(cudaEventRecord(copyEvent, stream));
 
   const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
@@ -322,6 +332,7 @@ COOMatrix _CSRRowWiseSamplingUniform(
   CUDA_CALL(cudaEventSynchronize(copyEvent));
   CUDA_CALL(cudaEventDestroy(copyEvent));
 
+  const IdType new_len = static_cast<const IdType*>(new_len_tensor->data)[0];
   picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
   picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
   picked_idx = picked_idx.CreateView({new_len}, picked_idx->dtype);
diff --git a/src/graph/transform/cuda/cuda_to_block.cu b/src/graph/transform/cuda/cuda_to_block.cu
index f81c8713b910..a8bffc8cc6e8 100644
--- a/src/graph/transform/cuda/cuda_to_block.cu
+++ b/src/graph/transform/cuda/cuda_to_block.cu
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include <dgl/runtime/tensordispatch.h>
 
 #include
 #include
@@ -36,6 +37,7 @@
 using namespace dgl::aten;
 using namespace dgl::runtime::cuda;
 using namespace dgl::transform::cuda;
+using TensorDispatcher = dgl::runtime::TensorDispatcher;
 
 namespace dgl {
 namespace transform {
@@ -165,6 +167,9 @@ struct CUDAIdsMapper {
             NewIdArray(maxNodesPerType[ntype], ctx, sizeof(IdType) * 8));
       }
     }
+
+    cudaEvent_t copyEvent;
+    NDArray new_len_tensor;
     // Populate the mappings.
     if (generate_lhs_nodes) {
       int64_t* count_lhs_device = static_cast<int64_t*>(
@@ -174,13 +179,23 @@ struct CUDAIdsMapper {
           src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes,
           stream);
 
-      device->CopyDataFromTo(
-          count_lhs_device, 0, num_nodes_per_type.data(), 0,
-          sizeof(*num_nodes_per_type.data()) * num_ntypes, ctx,
-          DGLContext{kDGLCPU, 0}, DGLDataType{kDGLInt, 64, 1});
-      device->StreamSync(ctx, stream);
+      CUDA_CALL(cudaEventCreate(&copyEvent));
+      if (TensorDispatcher::Global()->IsAvailable()) {
+        new_len_tensor = NDArray::PinnedEmpty(
+            {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
+            DGLContext{kDGLCPU, 0});
+      } else {
+        // use pageable memory, it will unnecessarily block but be functional
+        new_len_tensor = NDArray::Empty(
+            {num_ntypes}, DGLDataTypeTraits<int64_t>::dtype,
+            DGLContext{kDGLCPU, 0});
+      }
+      CUDA_CALL(cudaMemcpyAsync(
+          new_len_tensor->data, count_lhs_device,
+          sizeof(*num_nodes_per_type.data()) * num_ntypes,
+          cudaMemcpyDeviceToHost, stream));
+      CUDA_CALL(cudaEventRecord(copyEvent, stream));
 
-      // Wait for the node counts to finish transferring.
       device->FreeWorkspace(ctx, count_lhs_device);
     } else {
       maker.Make(lhs_nodes, rhs_nodes, &node_maps, stream);
@@ -189,14 +204,23 @@ struct CUDAIdsMapper {
         num_nodes_per_type[ntype] = lhs_nodes[ntype]->shape[0];
       }
     }
 
-    // Resize lhs nodes.
+    // Map node numberings from global to local, and build pointer for CSR.
+    auto ret = MapEdges(graph, edge_arrays, node_maps, stream);
+
     if (generate_lhs_nodes) {
+      // wait for the previous copy
+      CUDA_CALL(cudaEventSynchronize(copyEvent));
+      CUDA_CALL(cudaEventDestroy(copyEvent));
+
+      // Resize lhs nodes.
       for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) {
+        num_nodes_per_type[ntype] =
+            static_cast<int64_t*>(new_len_tensor->data)[ntype];
         lhs_nodes[ntype]->shape[0] = num_nodes_per_type[ntype];
       }
     }
-    // Map node numberings from global to local, and build pointer for CSR.
-    return MapEdges(graph, edge_arrays, node_maps, stream);
+
+    return ret;
   }
 };
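
Reviewer note, not part of the patch: both files now follow the same CUDA readback pattern. The host destination is allocated in pinned (page-locked) memory when the TensorDispatcher is available, the device-to-host copy is issued with cudaMemcpyAsync on the worker stream, a cudaEvent is recorded right after the copy, further GPU work keeps getting queued, and cudaEventSynchronize is called only at the point where the host actually needs the value. With pageable host memory the "async" copy blocks the calling thread, which is what the fallback branch's comment warns about. The standalone sketch below illustrates that pattern outside of DGL; the kernel names (ComputeLength, MoreWork), the buffer sizes, and the CHECK_CUDA macro are invented for the example, and plain cudaMallocHost stands in for NDArray::PinnedEmpty.

#include <cstdio>
#include <cuda_runtime.h>

#define CHECK_CUDA(call)                                                \
  do {                                                                  \
    cudaError_t err = (call);                                           \
    if (err != cudaSuccess) {                                           \
      fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));     \
      return 1;                                                         \
    }                                                                   \
  } while (0)

// Stand-in for the sampling kernel: writes the "result length" to d_len.
__global__ void ComputeLength(int64_t* d_len) { *d_len = 42; }

// Stand-in for the remaining GPU work that can overlap with the D2H copy.
__global__ void MoreWork(float* d_buf, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) d_buf[i] = i * 2.0f;
}

int main() {
  cudaStream_t stream;
  CHECK_CUDA(cudaStreamCreate(&stream));

  int64_t* d_len = nullptr;
  float* d_buf = nullptr;
  CHECK_CUDA(cudaMalloc(&d_len, sizeof(int64_t)));
  CHECK_CUDA(cudaMalloc(&d_buf, 1024 * sizeof(float)));

  // Pinned (page-locked) host memory: cudaMemcpyAsync into it can overlap
  // with GPU work and does not block the host thread. With pageable memory
  // the same call would effectively synchronize.
  int64_t* h_len = nullptr;
  CHECK_CUDA(cudaMallocHost(&h_len, sizeof(int64_t)));

  cudaEvent_t copyEvent;
  CHECK_CUDA(cudaEventCreate(&copyEvent));

  ComputeLength<<<1, 1, 0, stream>>>(d_len);

  // Queue the D2H copy and mark its completion with an event, then keep
  // launching GPU work on the same stream instead of synchronizing now.
  CHECK_CUDA(cudaMemcpyAsync(
      h_len, d_len, sizeof(int64_t), cudaMemcpyDeviceToHost, stream));
  CHECK_CUDA(cudaEventRecord(copyEvent, stream));

  MoreWork<<<4, 256, 0, stream>>>(d_buf, 1024);

  // Wait only on the copy event when the host actually needs the value;
  // MoreWork, enqueued after the event, may still be running.
  CHECK_CUDA(cudaEventSynchronize(copyEvent));
  CHECK_CUDA(cudaEventDestroy(copyEvent));
  printf("length copied back: %lld\n", (long long)(*h_len));

  CHECK_CUDA(cudaStreamSynchronize(stream));
  CHECK_CUDA(cudaFreeHost(h_len));
  CHECK_CUDA(cudaFree(d_len));
  CHECK_CUDA(cudaFree(d_buf));
  CHECK_CUDA(cudaStreamDestroy(stream));
  return 0;
}

Build with nvcc (e.g. nvcc pinned_copy_sketch.cu -o pinned_copy_sketch, a hypothetical file name). The point to notice is that cudaEventSynchronize waits only for work recorded before the event, so the host gets its value back without draining the rest of the stream, which is exactly the synchronization the patch removes from the sampling and to_block paths.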