diff --git a/graphbolt/include/graphbolt/cuda_ops.h b/graphbolt/include/graphbolt/cuda_ops.h
index c840842fce32..69694d8057d5 100644
--- a/graphbolt/include/graphbolt/cuda_ops.h
+++ b/graphbolt/include/graphbolt/cuda_ops.h
@@ -228,6 +228,25 @@ torch::Tensor ExpandIndptrImpl(
     torch::optional<torch::Tensor> node_ids = torch::nullopt,
     torch::optional<int64_t> output_size = torch::nullopt);
 
+/**
+ * @brief IndptrEdgeIdsImpl implements conversion from a given indptr offset
+ * tensor to a COO edge ids tensor. For a given indptr [0, 2, 5, 7] and offset
+ * tensor [0, 100, 200], the output will be [0, 1, 100, 101, 102, 200, 201]. If
+ * offset is not provided, the output is [0, 1, 0, 1, 2, 0, 1].
+ *
+ * @param indptr The indptr offset tensor.
+ * @param dtype The dtype of the returned output tensor.
+ * @param offset The offset tensor.
+ * @param output_size Optional value of indptr[-1]. Passing it eliminates
+ * CPU-GPU synchronization.
+ *
+ * @return The resulting tensor.
+ */
+torch::Tensor IndptrEdgeIdsImpl(
+    torch::Tensor indptr, torch::ScalarType dtype,
+    torch::optional<torch::Tensor> offset,
+    torch::optional<int64_t> output_size);
+
 /**
  * @brief Removes duplicate elements from the concatenated 'unique_dst_ids' and
  * 'src_ids' tensor and applies the uniqueness information to compact both
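For reference, here is a minimal host-side sketch of the semantics documented above, written against plain `std::vector` instead of tensors. It is an illustration only, not the CUDA implementation; the helper name `IndptrEdgeIdsReference` is made up for this example.

```cpp
// Reference sketch (assumption: not GraphBolt code) of what IndptrEdgeIdsImpl
// computes: one counting sequence per row, starting at offset[row] (or 0).
#include <cstdint>
#include <optional>
#include <vector>

std::vector<int64_t> IndptrEdgeIdsReference(
    const std::vector<int64_t>& indptr,
    const std::optional<std::vector<int64_t>>& offset) {
  std::vector<int64_t> edge_ids;
  edge_ids.reserve(indptr.back());
  for (std::size_t row = 0; row + 1 < indptr.size(); ++row) {
    const int64_t degree = indptr[row + 1] - indptr[row];
    // Each row contributes a counting sequence of length `degree` starting at
    // offset[row], or at 0 when no offset tensor was provided.
    const int64_t start = offset ? (*offset)[row] : 0;
    for (int64_t j = 0; j < degree; ++j) edge_ids.push_back(start + j);
  }
  return edge_ids;
}

// indptr = [0, 2, 5, 7], offset = [0, 100, 200] -> [0, 1, 100, 101, 102, 200, 201]
// indptr = [0, 2, 5, 7], no offset              -> [0, 1, 0, 1, 2, 0, 1]
```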
diff --git a/graphbolt/src/cuda/expand_indptr.cu b/graphbolt/src/cuda/expand_indptr.cu
index df57dcbf4f16..71c64584057d 100644
--- a/graphbolt/src/cuda/expand_indptr.cu
+++ b/graphbolt/src/cuda/expand_indptr.cu
@@ -37,6 +37,14 @@ struct RepeatIndex {
   }
 };
 
+template <typename indices_t, typename nodes_t>
+struct IotaIndex {
+  const nodes_t* nodes;
+  __host__ __device__ auto operator()(indices_t i) {
+    return thrust::make_counting_iterator(nodes ? nodes[i] : 0);
+  }
+};
+
 template <typename indptr_t, typename indices_t>
 struct OutputBufferIndexer {
   const indptr_t* indptr;
@@ -54,8 +62,8 @@ struct AdjacentDifference {
 
 torch::Tensor ExpandIndptrImpl(
     torch::Tensor indptr, torch::ScalarType dtype,
-    torch::optional<torch::Tensor> nodes,
-    torch::optional<int64_t> output_size) {
+    torch::optional<torch::Tensor> nodes, torch::optional<int64_t> output_size,
+    const bool edge_ids) {
   if (!output_size.has_value()) {
     output_size = AT_DISPATCH_INTEGRAL_TYPES(
         indptr.scalar_type(), "ExpandIndptrIndptr[-1]", ([&]() -> int64_t {
@@ -84,8 +92,6 @@ torch::Tensor ExpandIndptrImpl(
             const auto nodes_ptr =
                 nodes ? nodes.value().data_ptr<nodes_t>() : nullptr;
             thrust::counting_iterator<int64_t> iota(0);
-            auto input_buffer = thrust::make_transform_iterator(
-                iota, RepeatIndex<indices_t, nodes_t>{nodes_ptr});
             auto output_buffer = thrust::make_transform_iterator(
                 iota, OutputBufferIndexer<indptr_t, indices_t>{
                           indptr_ptr, csc_rows_ptr});
@@ -95,11 +101,25 @@ torch::Tensor ExpandIndptrImpl(
             const auto num_rows = indptr.size(0) - 1;
             constexpr int64_t max_copy_at_once =
                 std::numeric_limits<int32_t>::max();
-            for (int64_t i = 0; i < num_rows; i += max_copy_at_once) {
-              CUB_CALL(
-                  DeviceCopy::Batched, input_buffer + i,
-                  output_buffer + i, buffer_sizes + i,
-                  std::min(num_rows - i, max_copy_at_once));
+
+            if (edge_ids) {
+              auto input_buffer = thrust::make_transform_iterator(
+                  iota, IotaIndex<indices_t, nodes_t>{nodes_ptr});
+              for (int64_t i = 0; i < num_rows; i += max_copy_at_once) {
+                CUB_CALL(
+                    DeviceCopy::Batched, input_buffer + i,
+                    output_buffer + i, buffer_sizes + i,
+                    std::min(num_rows - i, max_copy_at_once));
+              }
+            } else {
+              auto input_buffer = thrust::make_transform_iterator(
+                  iota, RepeatIndex<indices_t, nodes_t>{nodes_ptr});
+              for (int64_t i = 0; i < num_rows; i += max_copy_at_once) {
+                CUB_CALL(
+                    DeviceCopy::Batched, input_buffer + i,
+                    output_buffer + i, buffer_sizes + i,
+                    std::min(num_rows - i, max_copy_at_once));
+              }
             }
           }));
     }));
@@ -107,5 +127,19 @@ torch::Tensor ExpandIndptrImpl(
   return csc_rows;
 }
 
+torch::Tensor ExpandIndptrImpl(
+    torch::Tensor indptr, torch::ScalarType dtype,
+    torch::optional<torch::Tensor> nodes,
+    torch::optional<int64_t> output_size) {
+  return ExpandIndptrImpl(indptr, dtype, nodes, output_size, false);
+}
+
+torch::Tensor IndptrEdgeIdsImpl(
+    torch::Tensor indptr, torch::ScalarType dtype,
+    torch::optional<torch::Tensor> offset,
+    torch::optional<int64_t> output_size) {
+  return ExpandIndptrImpl(indptr, dtype, offset, output_size, true);
+}
+
 }  // namespace ops
 }  // namespace graphbolt
diff --git a/graphbolt/src/cuda/insubgraph.cu b/graphbolt/src/cuda/insubgraph.cu
index 50ffd34b58b8..224d1407d6b0 100644
--- a/graphbolt/src/cuda/insubgraph.cu
+++ b/graphbolt/src/cuda/insubgraph.cu
@@ -39,11 +39,8 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> InSubgraph(
         in_degree, sliced_indptr, type_per_edge.value(), nodes,
         indptr.size(0) - 2, num_edges));
   }
-  auto rows = ExpandIndptrImpl(
-      output_indptr, indices.scalar_type(), torch::nullopt, num_edges);
-  auto i = torch::arange(output_indices.size(0), output_indptr.options());
-  auto edge_ids =
-      i - output_indptr.gather(0, rows) + sliced_indptr.gather(0, rows);
+  auto edge_ids = IndptrEdgeIdsImpl(
+      output_indptr, sliced_indptr.scalar_type(), sliced_indptr, num_edges);
 
   return c10::make_intrusive<sampling::FusedSampledSubgraph>(
       output_indptr, output_indices, nodes, torch::nullopt, edge_ids,
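The InSubgraph change replaces three gather-based tensor ops with a single fused call. A hedged sketch of why the two produce the same edge ids, using plain C++ and hypothetical values rather than GraphBolt tensors:

```cpp
// Sketch (assumption: not GraphBolt code): the removed formula
//   edge_ids[i] = i - output_indptr[r] + sliced_indptr[r]
// and the per-row counting sequence produced by IndptrEdgeIdsImpl agree.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const std::vector<int64_t> output_indptr = {0, 2, 5, 7};  // subgraph indptr
  const std::vector<int64_t> sliced_indptr = {10, 40, 90};  // per-node offsets into the original indices

  std::vector<int64_t> gather_based, iota_based;
  for (std::size_t r = 0; r + 1 < output_indptr.size(); ++r) {
    for (int64_t i = output_indptr[r]; i < output_indptr[r + 1]; ++i) {
      gather_based.push_back(i - output_indptr[r] + sliced_indptr[r]);  // removed code path
      iota_based.push_back(sliced_indptr[r] + (i - output_indptr[r]));  // IndptrEdgeIds semantics
    }
  }
  assert(gather_based == iota_based);  // both are {10, 11, 40, 41, 42, 90, 91}
  return 0;
}
```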
diff --git a/graphbolt/src/expand_indptr.cc b/graphbolt/src/expand_indptr.cc
index 591973f5182a..d9ab41f6c3b5 100644
--- a/graphbolt/src/expand_indptr.cc
+++ b/graphbolt/src/expand_indptr.cc
@@ -30,6 +30,19 @@ torch::Tensor ExpandIndptr(
       indptr.diff(), 0, output_size);
 }
 
+torch::Tensor IndptrEdgeIds(
+    torch::Tensor indptr, torch::ScalarType dtype,
+    torch::optional<torch::Tensor> offset,
+    torch::optional<int64_t> output_size) {
+  if (utils::is_on_gpu(indptr) &&
+      (!offset.has_value() || utils::is_on_gpu(offset.value()))) {
+    GRAPHBOLT_DISPATCH_CUDA_ONLY_DEVICE(
+        c10::DeviceType::CUDA, "IndptrEdgeIds",
+        { return IndptrEdgeIdsImpl(indptr, dtype, offset, output_size); });
+  }
+  TORCH_CHECK(false, "CPU implementation of IndptrEdgeIds is not available.");
+}
+
 TORCH_LIBRARY_IMPL(graphbolt, CPU, m) {
   m.impl("expand_indptr", &ExpandIndptr);
 }
@@ -44,5 +57,19 @@ TORCH_LIBRARY_IMPL(graphbolt, Autograd, m) {
   m.impl("expand_indptr", torch::autograd::autogradNotImplementedFallback());
 }
 
+TORCH_LIBRARY_IMPL(graphbolt, CPU, m) {
+  m.impl("indptr_edge_ids", &IndptrEdgeIds);
+}
+
+#ifdef GRAPHBOLT_USE_CUDA
+TORCH_LIBRARY_IMPL(graphbolt, CUDA, m) {
+  m.impl("indptr_edge_ids", &IndptrEdgeIdsImpl);
+}
+#endif
+
+TORCH_LIBRARY_IMPL(graphbolt, Autograd, m) {
+  m.impl("indptr_edge_ids", torch::autograd::autogradNotImplementedFallback());
+}
+
 }  // namespace ops
 }  // namespace graphbolt
diff --git a/graphbolt/src/expand_indptr.h b/graphbolt/src/expand_indptr.h
index 99effa7b8857..9a20cceb4839 100644
--- a/graphbolt/src/expand_indptr.h
+++ b/graphbolt/src/expand_indptr.h
@@ -30,6 +30,25 @@ torch::Tensor ExpandIndptr(
     torch::optional<torch::Tensor> node_ids = torch::nullopt,
     torch::optional<int64_t> output_size = torch::nullopt);
 
+/**
+ * @brief IndptrEdgeIds implements conversion from a given indptr offset
+ * tensor to a COO edge ids tensor. For a given indptr [0, 2, 5, 7] and offset
+ * tensor [0, 100, 200], the output will be [0, 1, 100, 101, 102, 200, 201]. If
+ * offset is not provided, the output is [0, 1, 0, 1, 2, 0, 1].
+ *
+ * @param indptr The indptr offset tensor.
+ * @param dtype The dtype of the returned output tensor.
+ * @param offset The offset tensor.
+ * @param output_size Optional value of indptr[-1]. Passing it eliminates
+ * CPU-GPU synchronization.
+ *
+ * @return The resulting tensor.
+ */
+torch::Tensor IndptrEdgeIds(
+    torch::Tensor indptr, torch::ScalarType dtype,
+    torch::optional<torch::Tensor> offset,
+    torch::optional<int64_t> output_size);
+
 }  // namespace ops
 }  // namespace graphbolt
diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc
index ca62fab419ba..c1125a4dfd4a 100644
--- a/graphbolt/src/python_binding.cc
+++ b/graphbolt/src/python_binding.cc
@@ -167,6 +167,15 @@ TORCH_LIBRARY(graphbolt, m) {
 #ifdef HAS_PT2_COMPLIANT_TAG
       ,
       {at::Tag::pt2_compliant_tag}
+#endif
+  );
+  m.def(
+      "indptr_edge_ids(Tensor indptr, ScalarType dtype, Tensor? offset, "
+      "SymInt? output_size) -> "
+      "Tensor"
+#ifdef HAS_PT2_COMPLIANT_TAG
+      ,
+      {at::Tag::pt2_compliant_tag}
 #endif
   );
 }
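A possible usage sketch of the newly declared C++ entry point, not part of the change. It assumes a CUDA build with the GraphBolt headers and library on the include and link paths, and reproduces the doc-comment example:

```cpp
// Usage sketch (assumptions: CUDA build, GraphBolt available at link time).
#include <torch/torch.h>

#include "expand_indptr.h"  // include path is an assumption for this sketch

torch::Tensor demo_indptr_edge_ids() {
  const auto opts = torch::dtype(torch::kInt64).device(torch::kCUDA);
  const auto indptr = torch::tensor({0, 2, 5, 7}, opts);
  const auto offset = torch::tensor({0, 100, 200}, opts);
  // Passing output_size (= indptr[-1] = 7) avoids a device-to-host sync.
  return graphbolt::ops::IndptrEdgeIds(indptr, torch::kInt64, offset, 7);
  // Expected: [0, 1, 100, 101, 102, 200, 201]
}
```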