diff --git a/graphbolt/include/graphbolt/cuda_ops.h b/graphbolt/include/graphbolt/cuda_ops.h
index c840842fce32..69694d8057d5 100644
--- a/graphbolt/include/graphbolt/cuda_ops.h
+++ b/graphbolt/include/graphbolt/cuda_ops.h
@@ -228,6 +228,25 @@ torch::Tensor ExpandIndptrImpl(
     torch::optional<torch::Tensor> node_ids = torch::nullopt,
     torch::optional<int64_t> output_size = torch::nullopt);
 
+/**
+ * @brief IndptrEdgeIdsImpl implements conversion from a given indptr offset
+ * tensor to a COO edge ids tensor. For a given indptr [0, 2, 5, 7] and offset
+ * tensor [0, 100, 200], the output will be [0, 1, 100, 101, 102, 200, 201]. If
+ * offset is not provided, the output is [0, 1, 0, 1, 2, 0, 1].
+ *
+ * @param indptr The indptr offset tensor.
+ * @param dtype The dtype of the returned output tensor.
+ * @param offset The offset tensor.
+ * @param output_size Optional value of indptr[-1]. Passing it eliminates
+ * CPU-GPU synchronization.
+ *
+ * @return The resulting tensor.
+ */
+torch::Tensor IndptrEdgeIdsImpl(
+    torch::Tensor indptr, torch::ScalarType dtype,
+    torch::optional<torch::Tensor> offset,
+    torch::optional<int64_t> output_size);
+
 /**
  * @brief Removes duplicate elements from the concatenated 'unique_dst_ids' and
  * 'src_ids' tensor and applies the uniqueness information to compact both
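For reference, here is a minimal host-side sketch of the semantics documented above, written against plain `std::vector` instead of tensors. It is an illustration only, not the CUDA implementation; the helper name `IndptrEdgeIdsReference` is made up for this example.

```cpp
// Reference sketch (assumption: not GraphBolt code) of what IndptrEdgeIdsImpl
// computes: one counting sequence per row, starting at offset[row] (or 0).
#include <cstdint>
#include <optional>
#include <vector>

std::vector<int64_t> IndptrEdgeIdsReference(
    const std::vector<int64_t>& indptr,
    const std::optional<std::vector<int64_t>>& offset) {
  std::vector<int64_t> edge_ids;
  edge_ids.reserve(indptr.back());
  for (std::size_t row = 0; row + 1 < indptr.size(); ++row) {
    const int64_t degree = indptr[row + 1] - indptr[row];
    // Each row contributes a counting sequence of length `degree` starting at
    // offset[row], or at 0 when no offset tensor was provided.
    const int64_t start = offset ? (*offset)[row] : 0;
    for (int64_t j = 0; j < degree; ++j) edge_ids.push_back(start + j);
  }
  return edge_ids;
}

// indptr = [0, 2, 5, 7], offset = [0, 100, 200] -> [0, 1, 100, 101, 102, 200, 201]
// indptr = [0, 2, 5, 7], no offset              -> [0, 1, 0, 1, 2, 0, 1]
```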
diff --git a/graphbolt/src/cuda/expand_indptr.cu b/graphbolt/src/cuda/expand_indptr.cu
index df57dcbf4f16..71c64584057d 100644
--- a/graphbolt/src/cuda/expand_indptr.cu
+++ b/graphbolt/src/cuda/expand_indptr.cu
@@ -37,6 +37,14 @@ struct RepeatIndex {
   }
 };
 
+template <typename indices_t, typename nodes_t>
+struct IotaIndex {
+  const nodes_t* nodes;
+  __host__ __device__ auto operator()(indices_t i) {
+    return thrust::make_counting_iterator(nodes ? nodes[i] : 0);
+  }
+};
+
 template <typename indptr_t, typename indices_t>
 struct OutputBufferIndexer {
   const indptr_t* indptr;
@@ -54,8 +62,8 @@ struct AdjacentDifference {
 
 torch::Tensor ExpandIndptrImpl(
     torch::Tensor indptr, torch::ScalarType dtype,
-    torch::optional<torch::Tensor> nodes,
-    torch::optional<int64_t> output_size) {
+    torch::optional<torch::Tensor> nodes, torch::optional<int64_t> output_size,
+    const bool edge_ids) {
   if (!output_size.has_value()) {
     output_size = AT_DISPATCH_INTEGRAL_TYPES(
         indptr.scalar_type(), "ExpandIndptrIndptr[-1]", ([&]() -> int64_t {
@@ -84,8 +92,6 @@ torch::Tensor ExpandIndptrImpl(
             const auto nodes_ptr =
                 nodes ? nodes.value().data_ptr<nodes_t>() : nullptr;
             thrust::counting_iterator<int64_t> iota(0);
-            auto input_buffer = thrust::make_transform_iterator(
-                iota, RepeatIndex<indices_t, nodes_t>{nodes_ptr});
             auto output_buffer = thrust::make_transform_iterator(
                 iota, OutputBufferIndexer<indptr_t, indices_t>{
                           indptr_ptr, csc_rows_ptr});
@@ -95,11 +101,25 @@ torch::Tensor ExpandIndptrImpl(
             const auto num_rows = indptr.size(0) - 1;
             constexpr int64_t max_copy_at_once =
                 std::numeric_limits<int32_t>::max();
-            for (int64_t i = 0; i < num_rows; i += max_copy_at_once) {
-              CUB_CALL(
-                  DeviceCopy::Batched, input_buffer + i,
-                  output_buffer + i, buffer_sizes + i,
-                  std::min(num_rows - i, max_copy_at_once));
+
+            if (edge_ids) {
+              auto input_buffer = thrust::make_transform_iterator(
+                  iota, IotaIndex<indices_t, nodes_t>{nodes_ptr});
+              for (int64_t i = 0; i < num_rows; i += max_copy_at_once) {
+                CUB_CALL(
+                    DeviceCopy::Batched, input_buffer + i,
+                    output_buffer + i, buffer_sizes + i,
+                    std::min(num_rows - i, max_copy_at_once));
+              }
+            } else {
+              auto input_buffer = thrust::make_transform_iterator(
+                  iota, RepeatIndex<indices_t, nodes_t>{nodes_ptr});
+              for (int64_t i = 0; i < num_rows; i += max_copy_at_once) {
+                CUB_CALL(
+                    DeviceCopy::Batched, input_buffer + i,
+                    output_buffer + i, buffer_sizes + i,
+                    std::min(num_rows - i, max_copy_at_once));
+              }
             }
           }));
     }));
@@ -107,5 +127,19 @@ torch::Tensor ExpandIndptrImpl(
   return csc_rows;
 }
 
+torch::Tensor ExpandIndptrImpl(
+    torch::Tensor indptr, torch::ScalarType dtype,
+    torch::optional<torch::Tensor> nodes,
+    torch::optional<int64_t> output_size) {
+  return ExpandIndptrImpl(indptr, dtype, nodes, output_size, false);
+}
+
+torch::Tensor IndptrEdgeIdsImpl(
+    torch::Tensor indptr, torch::ScalarType dtype,
+    torch::optional<torch::Tensor> offset,
+    torch::optional<int64_t> output_size) {
+  return ExpandIndptrImpl(indptr, dtype, offset, output_size, true);
+}
+
 }  // namespace ops
 }  // namespace graphbolt
diff --git a/graphbolt/src/cuda/insubgraph.cu b/graphbolt/src/cuda/insubgraph.cu
index 50ffd34b58b8..224d1407d6b0 100644
--- a/graphbolt/src/cuda/insubgraph.cu
+++ b/graphbolt/src/cuda/insubgraph.cu
@@ -39,11 +39,8 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> InSubgraph(
         in_degree, sliced_indptr, type_per_edge.value(), nodes,
         indptr.size(0) - 2, num_edges));
   }
-  auto rows = ExpandIndptrImpl(
-      output_indptr, indices.scalar_type(), torch::nullopt, num_edges);
-  auto i = torch::arange(output_indices.size(0), output_indptr.options());
-  auto edge_ids =
-      i - output_indptr.gather(0, rows) + sliced_indptr.gather(0, rows);
+  auto edge_ids = IndptrEdgeIdsImpl(
+      output_indptr, sliced_indptr.scalar_type(), sliced_indptr, num_edges);
 
   return c10::make_intrusive<sampling::FusedSampledSubgraph>(
       output_indptr, output_indices, nodes, torch::nullopt, edge_ids,
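The InSubgraph change replaces three gather-based tensor ops with a single fused call. A hedged sketch of why the two produce the same edge ids, using plain C++ and hypothetical values rather than GraphBolt tensors:

```cpp
// Sketch (assumption: not GraphBolt code): the removed formula
//   edge_ids[i] = i - output_indptr[r] + sliced_indptr[r]
// and the per-row counting sequence produced by IndptrEdgeIdsImpl agree.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const std::vector<int64_t> output_indptr = {0, 2, 5, 7};  // subgraph indptr
  const std::vector<int64_t> sliced_indptr = {10, 40, 90};  // per-node offsets into the original indices

  std::vector<int64_t> gather_based, iota_based;
  for (std::size_t r = 0; r + 1 < output_indptr.size(); ++r) {
    for (int64_t i = output_indptr[r]; i < output_indptr[r + 1]; ++i) {
      gather_based.push_back(i - output_indptr[r] + sliced_indptr[r]);  // removed code path
      iota_based.push_back(sliced_indptr[r] + (i - output_indptr[r]));  // IndptrEdgeIds semantics
    }
  }
  assert(gather_based == iota_based);  // both are {10, 11, 40, 41, 42, 90, 91}
  return 0;
}
```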
diff --git a/graphbolt/src/expand_indptr.cc b/graphbolt/src/expand_indptr.cc
index 591973f5182a..d9ab41f6c3b5 100644
--- a/graphbolt/src/expand_indptr.cc
+++ b/graphbolt/src/expand_indptr.cc
@@ -30,6 +30,19 @@ torch::Tensor ExpandIndptr(
       indptr.diff(), 0, output_size);
 }
 
+torch::Tensor IndptrEdgeIds(
+    torch::Tensor indptr, torch::ScalarType dtype,
+    torch::optional<torch::Tensor> offset,
+    torch::optional<int64_t> output_size) {
+  if (utils::is_on_gpu(indptr) &&
+      (!offset.has_value() || utils::is_on_gpu(offset.value()))) {
+    GRAPHBOLT_DISPATCH_CUDA_ONLY_DEVICE(
+        c10::DeviceType::CUDA, "IndptrEdgeIds",
+        { return IndptrEdgeIdsImpl(indptr, dtype, offset, output_size); });
+  }
+  TORCH_CHECK(false, "CPU implementation of IndptrEdgeIds is not available.");
+}
+
 TORCH_LIBRARY_IMPL(graphbolt, CPU, m) {
   m.impl("expand_indptr", &ExpandIndptr);
 }
@@ -44,5 +57,19 @@ TORCH_LIBRARY_IMPL(graphbolt, Autograd, m) {
   m.impl("expand_indptr", torch::autograd::autogradNotImplementedFallback());
 }
 
+TORCH_LIBRARY_IMPL(graphbolt, CPU, m) {
+  m.impl("indptr_edge_ids", &IndptrEdgeIds);
+}
+
+#ifdef GRAPHBOLT_USE_CUDA
+TORCH_LIBRARY_IMPL(graphbolt, CUDA, m) {
+  m.impl("indptr_edge_ids", &IndptrEdgeIdsImpl);
+}
+#endif
+
+TORCH_LIBRARY_IMPL(graphbolt, Autograd, m) {
+  m.impl("indptr_edge_ids", torch::autograd::autogradNotImplementedFallback());
+}
+
 }  // namespace ops
 }  // namespace graphbolt
diff --git a/graphbolt/src/expand_indptr.h b/graphbolt/src/expand_indptr.h
index 99effa7b8857..9a20cceb4839 100644
--- a/graphbolt/src/expand_indptr.h
+++ b/graphbolt/src/expand_indptr.h
@@ -30,6 +30,25 @@ torch::Tensor ExpandIndptr(
     torch::optional<torch::Tensor> node_ids = torch::nullopt,
     torch::optional<int64_t> output_size = torch::nullopt);
 
+/**
+ * @brief IndptrEdgeIds implements conversion from a given indptr offset
+ * tensor to a COO edge ids tensor. For a given indptr [0, 2, 5, 7] and offset
+ * tensor [0, 100, 200], the output will be [0, 1, 100, 101, 102, 200, 201]. If
+ * offset is not provided, the output is [0, 1, 0, 1, 2, 0, 1].
+ *
+ * @param indptr The indptr offset tensor.
+ * @param dtype The dtype of the returned output tensor.
+ * @param offset The offset tensor.
+ * @param output_size Optional value of indptr[-1]. Passing it eliminates
+ * CPU-GPU synchronization.
+ *
+ * @return The resulting tensor.
+ */
+torch::Tensor IndptrEdgeIds(
+    torch::Tensor indptr, torch::ScalarType dtype,
+    torch::optional<torch::Tensor> offset,
+    torch::optional<int64_t> output_size);
+
 }  // namespace ops
 }  // namespace graphbolt
diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc
index ca62fab419ba..c1125a4dfd4a 100644
--- a/graphbolt/src/python_binding.cc
+++ b/graphbolt/src/python_binding.cc
@@ -167,6 +167,15 @@ TORCH_LIBRARY(graphbolt, m) {
 #ifdef HAS_PT2_COMPLIANT_TAG
       ,
       {at::Tag::pt2_compliant_tag}
+#endif
+  );
+  m.def(
+      "indptr_edge_ids(Tensor indptr, ScalarType dtype, Tensor? offset, "
+      "SymInt? output_size) -> "
+      "Tensor"
+#ifdef HAS_PT2_COMPLIANT_TAG
+      ,
+      {at::Tag::pt2_compliant_tag}
 #endif
   );
 }
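A possible usage sketch of the newly declared C++ entry point, not part of the change. It assumes a CUDA build with the GraphBolt headers and library on the include and link paths, and reproduces the doc-comment example:

```cpp
// Usage sketch (assumptions: CUDA build, GraphBolt available at link time).
#include <torch/torch.h>

#include "expand_indptr.h"  // include path is an assumption for this sketch

torch::Tensor demo_indptr_edge_ids() {
  const auto opts = torch::dtype(torch::kInt64).device(torch::kCUDA);
  const auto indptr = torch::tensor({0, 2, 5, 7}, opts);
  const auto offset = torch::tensor({0, 100, 200}, opts);
  // Passing output_size (= indptr[-1] = 7) avoids a device-to-host sync.
  return graphbolt::ops::IndptrEdgeIds(indptr, torch::kInt64, offset, 7);
  // Expected: [0, 1, 100, 101, 102, 200, 201]
}
```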