Made the CUDA sanity checks work with SoA containers.
krasznaa committed Sep 21, 2024
1 parent 28e8247 commit 7fe10dc
Showing 6 changed files with 139 additions and 100 deletions.
11 changes: 6 additions & 5 deletions device/cuda/src/clusterization/clusterization_algorithm.cu
@@ -92,11 +92,6 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
const edm::silicon_cell_collection::const_view& cells,
const silicon_detector_description::const_view& det_descr) const {

// assert(is_contiguous_on(cell_module_projection(), m_mr.main, m_copy,
// m_stream, cells));
// assert(is_ordered_on(channel0_major_cell_order_relation(), m_mr.main,
// m_copy, m_stream, cells));

// Get a convenience variable for the stream that we'll be using.
cudaStream_t stream = details::get_stream(m_stream);

Expand All @@ -114,6 +109,12 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
return measurements;
}

assert(is_contiguous_on<edm::silicon_cell_collection::const_device>(
cell_module_projection(), m_mr.main, m_copy, m_stream, cells));
assert(is_ordered_on<edm::silicon_cell_collection::const_device>(
channel0_major_cell_order_relation(), m_mr.main, m_copy, m_stream,
cells));

// Create buffer for linking cells to their measurements.
//
// @todo Construct cell clusters on demand in a member function for
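The asserts above name the device container type explicitly because the sanity checks now invoke the projection as projection(container, index) instead of projection(element). A minimal sketch of a projection satisfying the new signature is shown below; the struct name and the module_index() accessor are assumptions for illustration, not the actual traccc definitions.

// Hypothetical sketch of a cell projection under the container+index
// interface used by the updated sanity checks. Assumes the SoA device
// container exposes a module_index() column; would need
// traccc/edm/silicon_cell_collection.hpp and
// traccc/definitions/qualifiers.hpp (for TRACCC_HOST_DEVICE).
struct example_cell_module_projection {
    TRACCC_HOST_DEVICE auto operator()(
        const edm::silicon_cell_collection::const_device& cells,
        std::size_t i) const {
        return cells.module_index().at(i);
    }
};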
5 changes: 3 additions & 2 deletions device/cuda/src/finding/finding_algorithm.cu
@@ -241,8 +241,9 @@ finding_algorithm<stepper_t, navigator_t>::operator()(
measurement_collection_types::const_view::size_type n_measurements =
m_copy.get_size(measurements);

assert(is_contiguous_on(measurement_module_projection(), m_mr.main, m_copy,
m_stream, measurements));
assert(is_contiguous_on<measurement_collection_types::const_device>(
measurement_module_projection(), m_mr.main, m_copy, m_stream,
measurements));

// Get copy of barcode uniques
measurement_collection_types::buffer uniques_buffer{n_measurements,
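The same container-plus-index convention applies to AoS collections such as the measurement collection: the projection receives the const_device vector and an index. A hypothetical sketch follows; the real measurement_module_projection is defined elsewhere in traccc, and the surface_link field used here is an assumption for illustration.

// Hypothetical sketch only, not the actual traccc functor.
struct example_measurement_projection {
    TRACCC_HOST_DEVICE auto operator()(
        const measurement_collection_types::const_device& measurements,
        std::size_t i) const {
        return measurements.at(i).surface_link;  // assumed field
    }
};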
61 changes: 34 additions & 27 deletions device/cuda/src/sanity/contiguous_on.cuh
@@ -10,11 +10,10 @@

// Project include(s).
#include "../utils/cuda_error_handling.hpp"
#include "../utils/utils.hpp"
#include "traccc/cuda/utils/stream.hpp"

// VecMem include(s).
#include <vecmem/containers/data/vector_view.hpp>
#include <vecmem/containers/device_vector.hpp>
#include <vecmem/memory/memory_resource.hpp>
#include <vecmem/memory/unique_ptr.hpp>
#include <vecmem/utils/copy.hpp>
@@ -27,23 +26,23 @@

namespace traccc::cuda {
namespace kernels {
template <std::semiregular P, typename T, typename S>
requires std::regular_invocable<P, T> __global__ void compress_adjacent(
P projection, vecmem::data::vector_view<T> _in, S* out,
uint32_t* out_size) {
template <typename CONTAINER, std::semiregular P, typename VIEW, typename S>
requires std::regular_invocable<P, CONTAINER, std::size_t> __global__ void
compress_adjacent(P projection, VIEW _in, S* out, uint32_t* out_size) {

int tid = threadIdx.x + blockIdx.x * blockDim.x;

vecmem::device_vector<T> in(_in);
CONTAINER in(_in);

if (tid > 0 && tid < in.size()) {
std::invoke_result_t<P, T> v1 = projection(in.at(tid - 1));
std::invoke_result_t<P, T> v2 = projection(in.at(tid));
S v1 = projection(in, tid - 1);
S v2 = projection(in, tid);

if (v1 != v2) {
out[atomicAdd(out_size, 1u)] = v2;
}
} else if (tid == 0) {
out[atomicAdd(out_size, 1u)] = projection(in.at(tid));
out[atomicAdd(out_size, 1u)] = projection(in, tid);
}
}

Expand All @@ -59,40 +58,47 @@ __global__ void all_unique(const T* in, const size_t n, bool* out) {
} // namespace kernels

/**
* @brief Sanity check that a given vector is contiguous on a given projection.
* @brief Sanity check that a given container is contiguous on a given
* projection.
*
* For a vector $v$ to be contiguous on a projection $\pi$, it must be the case
* that for all indices $i$ and $j$, if $v_i = v_j$, then all indices $k$
* For a container $v$ to be contiguous on a projection $\pi$, it must be the
* case that for all indices $i$ and $j$, if $v_i = v_j$, then all indices $k$
* between $i$ and $j$, $v_i = v_j = v_k$.
*
* @note This function runs in O(n^2) time.
*
* @tparam CONTAINER The type of the (device) container.
* @tparam P The type of projection $\pi$, a callable which returns some
* comparable type.
* @tparam T The type of the vector.
* @tparam VIEW The type of the view for the container.
* @param projection A projection object of type `P`.
* @param mr A memory resource used for allocating intermediate memory.
* @param vector The vector which to check for contiguity.
* @return true If the vector is contiguous on `P`.
* @param view The container which to check for contiguity.
* @return true If the container is contiguous on `P`.
* @return false Otherwise.
*/
template <std::semiregular P, std::equality_comparable T>
requires std::regular_invocable<P, T> bool is_contiguous_on(
P&& projection, vecmem::memory_resource& mr, vecmem::copy& copy,
stream& stream, vecmem::data::vector_view<T> vector) {
template <typename CONTAINER, std::semiregular P, typename VIEW>
requires std::regular_invocable<P, CONTAINER, std::size_t> bool
is_contiguous_on(P&& projection, vecmem::memory_resource& mr,
vecmem::copy& copy, stream& stream, const VIEW& view) {

// This should never be a performance-critical step, so we can keep the
// block size fixed.
constexpr int block_size = 512;
constexpr int block_size_2d = 32;

cudaStream_t cuda_stream =
reinterpret_cast<cudaStream_t>(stream.cudaStream());
cudaStream_t cuda_stream = details::get_stream(stream);

// Grab the number of elements in our vector.
uint32_t n = copy.get_size(vector);
// Grab the number of elements in our container.
std::size_t n = copy.get_size(view);

// Exit early for empty containers.
if (n == 0) {
return true;
}

// Get the output type of the projection.
using projection_t = std::invoke_result_t<P, T>;
using projection_t = std::invoke_result_t<P, CONTAINER, std::size_t>;

// Allocate memory for intermediate values and outputs, then set them up.
vecmem::unique_alloc_ptr<projection_t[]> iout =
Expand All @@ -113,9 +119,9 @@ requires std::regular_invocable<P, T> bool is_contiguous_on(

// Launch the first kernel, which will squash consecutive equal elements
// into one element.
kernels::compress_adjacent<P, T, projection_t>
kernels::compress_adjacent<CONTAINER>
<<<(n + block_size - 1) / block_size, block_size, 0, cuda_stream>>>(
projection, vector, iout.get(), iout_size.get());
projection, view, iout.get(), iout_size.get());

TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());

Expand All @@ -126,6 +132,7 @@ requires std::regular_invocable<P, T> bool is_contiguous_on(
TRACCC_CUDA_ERROR_CHECK(
cudaMemcpyAsync(&host_iout_size, iout_size.get(), sizeof(uint32_t),
cudaMemcpyDeviceToHost, cuda_stream));
stream.synchronize();

// Launch the second kernel, which will check if the values are unique.
uint32_t grid_size_rd =
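To make the contiguity definition concrete: on the identity projection, {1, 1, 2, 2, 3} is contiguous, while {1, 2, 1} is not, because the two 1s are separated by a 2. The following self-contained host-side sketch (illustrative only, not part of the commit) mirrors the two-kernel approach above: squash adjacent equal projections, then check that the surviving values are unique.

#include <cstddef>
#include <set>
#include <vector>

// Host-side analogue of compress_adjacent followed by all_unique.
template <typename Container, typename Projection>
bool is_contiguous_on_host(Projection proj, const Container& c) {
    using value_t = decltype(proj(c, std::size_t{0}));
    std::vector<value_t> compressed;
    for (std::size_t i = 0; i < c.size(); ++i) {
        value_t v = proj(c, i);
        // Keep only the first element of each run of equal projections.
        if (compressed.empty() || !(compressed.back() == v)) {
            compressed.push_back(v);
        }
    }
    // Contiguous iff no projected value appears in two separate runs.
    std::set<value_t> unique(compressed.begin(), compressed.end());
    return unique.size() == compressed.size();
}

// Usage with an identity projection over std::vector<int>:
//   auto identity = [](const std::vector<int>& v, std::size_t i) { return v[i]; };
//   std::vector<int> a{1, 1, 2, 2, 3};  // is_contiguous_on_host(identity, a) == true
//   std::vector<int> b{1, 2, 1};        // is_contiguous_on_host(identity, b) == false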
51 changes: 28 additions & 23 deletions device/cuda/src/sanity/ordered_on.cuh
@@ -10,11 +10,10 @@

// Project include(s).
#include "../utils/cuda_error_handling.hpp"
#include "../utils/utils.hpp"
#include "traccc/cuda/utils/stream.hpp"

// VecMem include(s).
#include <vecmem/containers/data/vector_view.hpp>
#include <vecmem/containers/device_vector.hpp>
#include <vecmem/memory/memory_resource.hpp>
#include <vecmem/memory/unique_ptr.hpp>
#include <vecmem/utils/copy.hpp>
@@ -27,25 +26,25 @@

namespace traccc::cuda {
namespace kernels {
template <std::semiregular R, typename T>
requires std::relation<R, T, T> __global__ void is_ordered_on_kernel(
R relation, vecmem::data::vector_view<T> _in, bool* out) {
template <typename CONTAINER, std::semiregular R, typename VIEW>
requires std::regular_invocable<R, CONTAINER, std::size_t, std::size_t>
__global__ void is_ordered_on_kernel(R relation, VIEW _in, bool* out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;

vecmem::device_vector<T> in(_in);
CONTAINER in(_in);

if (tid > 0 && tid < in.size()) {
if (!relation(in.at(tid - 1), in.at(tid))) {
if (!relation(in, tid - 1, tid)) {
*out = false;
}
}
}
} // namespace kernels

/**
* @brief Sanity check that a given vector is ordered on a given relation.
* @brief Sanity check that a given container is ordered on a given relation.
*
* For a vector $v$ to be ordered on a relation $R$, it must be the case that
* For a container $v$ to be ordered on a relation $R$, it must be the case that
* for all indices $i$ and $j$, if $i < j$, then $R(i, j)$.
*
* @note This function runs in O(n) time.
Expand All @@ -56,28 +55,34 @@ requires std::relation<R, T, T> __global__ void is_ordered_on_kernel(
*
* @note For any strict weak order $R$, `is_ordered_on(sort(R, v))` is true.
*
* @tparam CONTAINER The type of the (device) container.
* @tparam R The type of relation $R$, a callable which returns a bool if the
* first argument can be immediately before the second type.
* @tparam T The type of the vector.
* @tparam VIEW The type of the view for the container.
* @param relation A relation object of type `R`.
* @param mr A memory resource used for allocating intermediate memory.
* @param vector The vector which to check for ordering.
* @return true If the vector is ordered on `R`.
* @param view The container which to check for ordering.
* @return true If the container is ordered on `R`.
* @return false Otherwise.
*/
template <std::semiregular R, typename T>
requires std::relation<R, T, T> bool is_ordered_on(
R relation, vecmem::memory_resource& mr, vecmem::copy& copy, stream& stream,
vecmem::data::vector_view<T> vector) {
template <typename CONTAINER, std::semiregular R, typename VIEW>
requires std::regular_invocable<R, CONTAINER, std::size_t, std::size_t> bool
is_ordered_on(R&& relation, vecmem::memory_resource& mr, vecmem::copy& copy,
stream& stream, const VIEW& view) {

// This should never be a performance-critical step, so we can keep the
// block size fixed.
constexpr int block_size = 512;

cudaStream_t cuda_stream =
reinterpret_cast<cudaStream_t>(stream.cudaStream());
cudaStream_t cuda_stream = details::get_stream(stream);

// Grab the number of elements in our container.
uint32_t n = copy.get_size(view);

// Grab the number of elements in our vector.
uint32_t n = copy.get_size(vector);
// Exit early for empty containers.
if (n == 0) {
return true;
}

// Initialize the output boolean.
vecmem::unique_alloc_ptr<bool> out = vecmem::make_unique_alloc<bool>(mr);
Expand All @@ -87,9 +92,9 @@ requires std::relation<R, T, T> bool is_ordered_on(
cudaMemcpyHostToDevice, cuda_stream));

// Launch the kernel which will write its result to the `out` boolean.
kernels::is_ordered_on_kernel<<<(n + block_size - 1) / block_size,
block_size, 0, cuda_stream>>>(
relation, vector, out.get());
kernels::is_ordered_on_kernel<CONTAINER>
<<<(n + block_size - 1) / block_size, block_size, 0, cuda_stream>>>(
relation, view, out.get());

TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());

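Relations follow the same pattern as projections: they now receive the container and two indices. A self-contained host-side sketch of such a relation and the corresponding adjacent-pair check (names are illustrative, not the actual traccc functors):

#include <cstddef>
#include <vector>

// A relation under the container+index interface: returns true if the
// element at index i may immediately precede the element at index j.
struct non_decreasing {
    bool operator()(const std::vector<int>& v, std::size_t i,
                    std::size_t j) const {
        return v[i] <= v[j];
    }
};

// Host-side analogue of is_ordered_on_kernel: check every adjacent pair.
inline bool is_ordered_on_host(const std::vector<int>& v) {
    non_decreasing rel;
    for (std::size_t i = 1; i < v.size(); ++i) {
        if (!rel(v, i - 1, i)) {
            return false;
        }
    }
    return true;
}

// {1, 2, 2, 3} is ordered on this relation; {2, 1} is not.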