Made the CUDA sanity checks work with SoA containers.
krasznaa committed Sep 21, 2024
1 parent 28e8247 commit 7fe10dc
Showing 6 changed files with 139 additions and 100 deletions.
11 changes: 6 additions & 5 deletions device/cuda/src/clusterization/clusterization_algorithm.cu
@@ -92,11 +92,6 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
const edm::silicon_cell_collection::const_view& cells,
const silicon_detector_description::const_view& det_descr) const {

// assert(is_contiguous_on(cell_module_projection(), m_mr.main, m_copy,
// m_stream, cells));
// assert(is_ordered_on(channel0_major_cell_order_relation(), m_mr.main,
// m_copy, m_stream, cells));

// Get a convenience variable for the stream that we'll be using.
cudaStream_t stream = details::get_stream(m_stream);

Expand All @@ -114,6 +109,12 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
return measurements;
}

assert(is_contiguous_on<edm::silicon_cell_collection::const_device>(
cell_module_projection(), m_mr.main, m_copy, m_stream, cells));
assert(is_ordered_on<edm::silicon_cell_collection::const_device>(
channel0_major_cell_order_relation(), m_mr.main, m_copy, m_stream,
cells));

// Create buffer for linking cells to their measurements.
//
// @todo Construct cell clusters on demand in a member function for
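The asserts above name the device container type explicitly because the sanity checks now invoke the projection as projection(container, index) instead of projection(element). A minimal sketch of a projection satisfying the new signature is shown below; the struct name and the module_index() accessor are assumptions for illustration, not the actual traccc definitions.

// Hypothetical sketch of a cell projection under the container+index
// interface used by the updated sanity checks. Assumes the SoA device
// container exposes a module_index() column; would need
// traccc/edm/silicon_cell_collection.hpp and
// traccc/definitions/qualifiers.hpp (for TRACCC_HOST_DEVICE).
struct example_cell_module_projection {
    TRACCC_HOST_DEVICE auto operator()(
        const edm::silicon_cell_collection::const_device& cells,
        std::size_t i) const {
        return cells.module_index().at(i);
    }
};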
5 changes: 3 additions & 2 deletions device/cuda/src/finding/finding_algorithm.cu
@@ -241,8 +241,9 @@ finding_algorithm<stepper_t, navigator_t>::operator()(
measurement_collection_types::const_view::size_type n_measurements =
m_copy.get_size(measurements);

assert(is_contiguous_on(measurement_module_projection(), m_mr.main, m_copy,
m_stream, measurements));
assert(is_contiguous_on<measurement_collection_types::const_device>(
measurement_module_projection(), m_mr.main, m_copy, m_stream,
measurements));

// Get copy of barcode uniques
measurement_collection_types::buffer uniques_buffer{n_measurements,
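The same container-plus-index convention applies to AoS collections such as the measurement collection: the projection receives the const_device vector and an index. A hypothetical sketch follows; the real measurement_module_projection is defined elsewhere in traccc, and the surface_link field used here is an assumption for illustration.

// Hypothetical sketch only, not the actual traccc functor.
struct example_measurement_projection {
    TRACCC_HOST_DEVICE auto operator()(
        const measurement_collection_types::const_device& measurements,
        std::size_t i) const {
        return measurements.at(i).surface_link;  // assumed field
    }
};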
61 changes: 34 additions & 27 deletions device/cuda/src/sanity/contiguous_on.cuh
@@ -10,11 +10,10 @@

// Project include(s).
#include "../utils/cuda_error_handling.hpp"
#include "../utils/utils.hpp"
#include "traccc/cuda/utils/stream.hpp"

// VecMem include(s).
#include <vecmem/containers/data/vector_view.hpp>
#include <vecmem/containers/device_vector.hpp>
#include <vecmem/memory/memory_resource.hpp>
#include <vecmem/memory/unique_ptr.hpp>
#include <vecmem/utils/copy.hpp>
@@ -27,23 +26,23 @@

namespace traccc::cuda {
namespace kernels {
template <std::semiregular P, typename T, typename S>
requires std::regular_invocable<P, T> __global__ void compress_adjacent(
P projection, vecmem::data::vector_view<T> _in, S* out,
uint32_t* out_size) {
template <typename CONTAINER, std::semiregular P, typename VIEW, typename S>
requires std::regular_invocable<P, CONTAINER, std::size_t> __global__ void
compress_adjacent(P projection, VIEW _in, S* out, uint32_t* out_size) {

int tid = threadIdx.x + blockIdx.x * blockDim.x;

vecmem::device_vector<T> in(_in);
CONTAINER in(_in);

if (tid > 0 && tid < in.size()) {
std::invoke_result_t<P, T> v1 = projection(in.at(tid - 1));
std::invoke_result_t<P, T> v2 = projection(in.at(tid));
S v1 = projection(in, tid - 1);
S v2 = projection(in, tid);

if (v1 != v2) {
out[atomicAdd(out_size, 1u)] = v2;
}
} else if (tid == 0) {
out[atomicAdd(out_size, 1u)] = projection(in.at(tid));
out[atomicAdd(out_size, 1u)] = projection(in, tid);
}
}

Expand All @@ -59,40 +58,47 @@ __global__ void all_unique(const T* in, const size_t n, bool* out) {
} // namespace kernels

/**
* @brief Sanity check that a given vector is contiguous on a given projection.
* @brief Sanity check that a given container is contiguous on a given
* projection.
*
* For a vector $v$ to be contiguous on a projection $\pi$, it must be the case
* that for all indices $i$ and $j$, if $v_i = v_j$, then all indices $k$
* For a container $v$ to be contiguous on a projection $\pi$, it must be the
* case that for all indices $i$ and $j$, if $v_i = v_j$, then all indices $k$
* between $i$ and $j$, $v_i = v_j = v_k$.
*
* @note This function runs in O(n^2) time.
*
* @tparam CONTAINER The type of the (device) container.
* @tparam P The type of projection $\pi$, a callable which returns some
* comparable type.
* @tparam T The type of the vector.
* @tparam VIEW The type of the view for the container.
* @param projection A projection object of type `P`.
* @param mr A memory resource used for allocating intermediate memory.
* @param vector The vector which to check for contiguity.
* @return true If the vector is contiguous on `P`.
* @param view The container which to check for contiguity.
* @return true If the container is contiguous on `P`.
* @return false Otherwise.
*/
template <std::semiregular P, std::equality_comparable T>
requires std::regular_invocable<P, T> bool is_contiguous_on(
P&& projection, vecmem::memory_resource& mr, vecmem::copy& copy,
stream& stream, vecmem::data::vector_view<T> vector) {
template <typename CONTAINER, std::semiregular P, typename VIEW>
requires std::regular_invocable<P, CONTAINER, std::size_t> bool
is_contiguous_on(P&& projection, vecmem::memory_resource& mr,
vecmem::copy& copy, stream& stream, const VIEW& view) {

// This should never be a performance-critical step, so we can keep the
// block size fixed.
constexpr int block_size = 512;
constexpr int block_size_2d = 32;

cudaStream_t cuda_stream =
reinterpret_cast<cudaStream_t>(stream.cudaStream());
cudaStream_t cuda_stream = details::get_stream(stream);

// Grab the number of elements in our vector.
uint32_t n = copy.get_size(vector);
// Grab the number of elements in our container.
std::size_t n = copy.get_size(view);

// Exit early for empty containers.
if (n == 0) {
return true;
}

// Get the output type of the projection.
using projection_t = std::invoke_result_t<P, T>;
using projection_t = std::invoke_result_t<P, CONTAINER, std::size_t>;

// Allocate memory for intermediate values and outputs, then set them up.
vecmem::unique_alloc_ptr<projection_t[]> iout =
Expand All @@ -113,9 +119,9 @@ requires std::regular_invocable<P, T> bool is_contiguous_on(

// Launch the first kernel, which will squash consecutive equal elements
// into one element.
kernels::compress_adjacent<P, T, projection_t>
kernels::compress_adjacent<CONTAINER>
<<<(n + block_size - 1) / block_size, block_size, 0, cuda_stream>>>(
projection, vector, iout.get(), iout_size.get());
projection, view, iout.get(), iout_size.get());

TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());

Expand All @@ -126,6 +132,7 @@ requires std::regular_invocable<P, T> bool is_contiguous_on(
TRACCC_CUDA_ERROR_CHECK(
cudaMemcpyAsync(&host_iout_size, iout_size.get(), sizeof(uint32_t),
cudaMemcpyDeviceToHost, cuda_stream));
stream.synchronize();

// Launch the second kernel, which will check if the values are unique.
uint32_t grid_size_rd =
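To make the contiguity definition concrete: on the identity projection, {1, 1, 2, 2, 3} is contiguous, while {1, 2, 1} is not, because the two 1s are separated by a 2. The following self-contained host-side sketch (illustrative only, not part of the commit) mirrors the two-kernel approach above: squash adjacent equal projections, then check that the surviving values are unique.

#include <cstddef>
#include <set>
#include <vector>

// Host-side analogue of compress_adjacent followed by all_unique.
template <typename Container, typename Projection>
bool is_contiguous_on_host(Projection proj, const Container& c) {
    using value_t = decltype(proj(c, std::size_t{0}));
    std::vector<value_t> compressed;
    for (std::size_t i = 0; i < c.size(); ++i) {
        value_t v = proj(c, i);
        // Keep only the first element of each run of equal projections.
        if (compressed.empty() || !(compressed.back() == v)) {
            compressed.push_back(v);
        }
    }
    // Contiguous iff no projected value appears in two separate runs.
    std::set<value_t> unique(compressed.begin(), compressed.end());
    return unique.size() == compressed.size();
}

// Usage with an identity projection over std::vector<int>:
//   auto identity = [](const std::vector<int>& v, std::size_t i) { return v[i]; };
//   std::vector<int> a{1, 1, 2, 2, 3};  // is_contiguous_on_host(identity, a) == true
//   std::vector<int> b{1, 2, 1};        // is_contiguous_on_host(identity, b) == false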
51 changes: 28 additions & 23 deletions device/cuda/src/sanity/ordered_on.cuh
@@ -10,11 +10,10 @@

// Project include(s).
#include "../utils/cuda_error_handling.hpp"
#include "../utils/utils.hpp"
#include "traccc/cuda/utils/stream.hpp"

// VecMem include(s).
#include <vecmem/containers/data/vector_view.hpp>
#include <vecmem/containers/device_vector.hpp>
#include <vecmem/memory/memory_resource.hpp>
#include <vecmem/memory/unique_ptr.hpp>
#include <vecmem/utils/copy.hpp>
@@ -27,25 +26,25 @@

namespace traccc::cuda {
namespace kernels {
template <std::semiregular R, typename T>
requires std::relation<R, T, T> __global__ void is_ordered_on_kernel(
R relation, vecmem::data::vector_view<T> _in, bool* out) {
template <typename CONTAINER, std::semiregular R, typename VIEW>
requires std::regular_invocable<R, CONTAINER, std::size_t, std::size_t>
__global__ void is_ordered_on_kernel(R relation, VIEW _in, bool* out) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;

vecmem::device_vector<T> in(_in);
CONTAINER in(_in);

if (tid > 0 && tid < in.size()) {
if (!relation(in.at(tid - 1), in.at(tid))) {
if (!relation(in, tid - 1, tid)) {
*out = false;
}
}
}
} // namespace kernels

/**
* @brief Sanity check that a given vector is ordered on a given relation.
* @brief Sanity check that a given container is ordered on a given relation.
*
* For a vector $v$ to be ordered on a relation $R$, it must be the case that
* For a container $v$ to be ordered on a relation $R$, it must be the case that
* for all indices $i$ and $j$, if $i < j$, then $R(i, j)$.
*
* @note This function runs in O(n) time.
Expand All @@ -56,28 +55,34 @@ requires std::relation<R, T, T> __global__ void is_ordered_on_kernel(
*
* @note For any strict weak order $R$, `is_ordered_on(sort(R, v))` is true.
*
* @tparam CONTAINER The type of the (device) container.
* @tparam R The type of relation $R$, a callable which returns a bool if the
* first argument can be immediately before the second type.
* @tparam T The type of the vector.
* @tparam VIEW The type of the view for the container.
* @param relation A relation object of type `R`.
* @param mr A memory resource used for allocating intermediate memory.
* @param vector The vector which to check for ordering.
* @return true If the vector is ordered on `R`.
* @param view The container which to check for ordering.
* @return true If the container is ordered on `R`.
* @return false Otherwise.
*/
template <std::semiregular R, typename T>
requires std::relation<R, T, T> bool is_ordered_on(
R relation, vecmem::memory_resource& mr, vecmem::copy& copy, stream& stream,
vecmem::data::vector_view<T> vector) {
template <typename CONTAINER, std::semiregular R, typename VIEW>
requires std::regular_invocable<R, CONTAINER, std::size_t, std::size_t> bool
is_ordered_on(R&& relation, vecmem::memory_resource& mr, vecmem::copy& copy,
stream& stream, const VIEW& view) {

// This should never be a performance-critical step, so we can keep the
// block size fixed.
constexpr int block_size = 512;

cudaStream_t cuda_stream =
reinterpret_cast<cudaStream_t>(stream.cudaStream());
cudaStream_t cuda_stream = details::get_stream(stream);

// Grab the number of elements in our container.
uint32_t n = copy.get_size(view);

// Grab the number of elements in our vector.
uint32_t n = copy.get_size(vector);
// Exit early for empty containers.
if (n == 0) {
return true;
}

// Initialize the output boolean.
vecmem::unique_alloc_ptr<bool> out = vecmem::make_unique_alloc<bool>(mr);
Expand All @@ -87,9 +92,9 @@ requires std::relation<R, T, T> bool is_ordered_on(
cudaMemcpyHostToDevice, cuda_stream));

// Launch the kernel which will write its result to the `out` boolean.
kernels::is_ordered_on_kernel<<<(n + block_size - 1) / block_size,
block_size, 0, cuda_stream>>>(
relation, vector, out.get());
kernels::is_ordered_on_kernel<CONTAINER>
<<<(n + block_size - 1) / block_size, block_size, 0, cuda_stream>>>(
relation, view, out.get());

TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());

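Relations follow the same pattern as projections: they now receive the container and two indices. A self-contained host-side sketch of such a relation and the corresponding adjacent-pair check (names are illustrative, not the actual traccc functors):

#include <cstddef>
#include <vector>

// A relation under the container+index interface: returns true if the
// element at index i may immediately precede the element at index j.
struct non_decreasing {
    bool operator()(const std::vector<int>& v, std::size_t i,
                    std::size_t j) const {
        return v[i] <= v[j];
    }
};

// Host-side analogue of is_ordered_on_kernel: check every adjacent pair.
inline bool is_ordered_on_host(const std::vector<int>& v) {
    non_decreasing rel;
    for (std::size_t i = 1; i < v.size(); ++i) {
        if (!rel(v, i - 1, i)) {
            return false;
        }
    }
    return true;
}

// {1, 2, 2, 3} is ordered on this relation; {2, 1} is not.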