Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
add option for shuffled buffer benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
elstehle committed Oct 13, 2021
1 parent 4f44fae commit 8f6d447
Showing 1 changed file with 65 additions and 14 deletions.
79 changes: 65 additions & 14 deletions test/test_device_batch_memcpy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,42 @@ void __global__ BaselineBatchMemCpyPerBlockKernel(InputBufferIt input_buffer_it,
}
}

/**
 * @brief Lays the given buffers out back-to-back in a shuffled order and returns, for every buffer, the byte offset
 * at which that buffer begins within the shuffled layout.
 *
 * The returned offsets are indexed by the original buffer index, i.e., `result[i]` is the offset of the i-th buffer
 * of @p buffer_sizes. Since the buffers are packed without gaps, the ranges `[result[i], result[i] + buffer_sizes[i])`
 * do not overlap and together cover `[0, sum(buffer_sizes))`.
 *
 * @note The permutation is drawn from a default-constructed `std::default_random_engine`, so every call uses the
 * same seed.
 *
 * @tparam BufferOffsetT Type used for indexing into the array of buffers
 * @tparam ByteOffsetT Type used for byte offsets; must be able to represent the sum of all buffer sizes
 * @tparam BufferSizeT Type of a single buffer's size
 * @param buffer_sizes The size of each buffer
 * @return One byte offset per buffer, in the order of @p buffer_sizes
 */
template <typename BufferOffsetT, typename ByteOffsetT, typename BufferSizeT>
std::vector<ByteOffsetT> GetShuffledBufferOffsets(const std::vector<BufferSizeT> &buffer_sizes)
{
  const BufferOffsetT num_buffers = static_cast<BufferOffsetT>(buffer_sizes.size());

  // pmt_idxs[k] is the index of the buffer that occupies the k-th slot of the shuffled layout
  std::vector<BufferOffsetT> pmt_idxs(num_buffers);
  std::iota(pmt_idxs.begin(), pmt_idxs.end(), static_cast<BufferOffsetT>(0));
  std::shuffle(pmt_idxs.begin(), pmt_idxs.end(), std::default_random_engine());

  // Walk the slots in layout order and record each buffer's starting offset directly at its original index.
  // Offsets are kept as ByteOffsetT (not BufferOffsetT) so they are neither truncated nor mismatched with the
  // declared return type when the two types differ.
  std::vector<ByteOffsetT> new_offsets(num_buffers);
  ByteOffsetT running_offset = {};
  for (const auto permuted_buffer_idx : pmt_idxs)
  {
    new_offsets[permuted_buffer_idx] = running_offset;
    running_offset += static_cast<ByteOffsetT>(buffer_sizes[permuted_buffer_idx]);
  }

  return new_offsets;
}

template <typename IteratorT>
struct OffsetToPtrOp
{
Expand All @@ -120,7 +156,7 @@ enum class InputDataGen
* @brief
*
* @tparam AtomicT The most granular type being copied. All source and destination pointers will be aligned based on
* this type
* this type, the number of bytes being copied will be an integer multiple of this type's size
* @tparam BufferOffsetT Type used for indexing into the array of buffers
* @tparam BufferSizeT Type used for indexing into individual bytes of a buffer (large enough to cover the max buffer
* size)
Expand Down Expand Up @@ -170,8 +206,14 @@ void RunTest(BufferOffsetT num_buffers,
ByteOffsetT num_total_bytes = 0;
for (BufferOffsetT i = 0; i < num_buffers; i++)
{
if (input_gen == InputDataGen::CONSECUTIVE)
{
h_buffer_src_offsets[i] = num_total_bytes;
}
if (output_gen == InputDataGen::CONSECUTIVE)
{
h_buffer_dst_offsets[i] = num_total_bytes;
h_buffer_src_offsets[i] = num_total_bytes;
}
num_total_bytes += h_buffer_sizes[i];
}

Expand All @@ -181,13 +223,13 @@ void RunTest(BufferOffsetT num_buffers,
// Shuffle input buffer source-offsets
if (input_gen == InputDataGen::RANDOM)
{
std::shuffle(std::begin(h_buffer_src_offsets), std::end(h_buffer_src_offsets), std::default_random_engine());
h_buffer_src_offsets = GetShuffledBufferOffsets<BufferOffsetT, ByteOffsetT>(h_buffer_sizes);
}

// Shuffle output buffer destination-offsets
if (output_gen == InputDataGen::RANDOM)
{
std::shuffle(std::begin(h_buffer_dst_offsets), std::end(h_buffer_dst_offsets), std::default_random_engine());
h_buffer_dst_offsets = GetShuffledBufferOffsets<BufferOffsetT, ByteOffsetT>(h_buffer_sizes);
}

// Allocate device memory
Expand Down Expand Up @@ -295,10 +337,15 @@ void RunTest(BufferOffsetT num_buffers,
size_t stats_sizes = sizeof(BufferSizeT) * num_buffers;
size_t stats_data_copied = 2 * num_total_bytes;

std::cout << "Src size: " << stats_src_offsets << ", dst size: " << stats_dst_offsets
<< ", Sizes size: " << stats_sizes << ", data_size: " << stats_data_copied
<< ", total: " << (stats_src_offsets + stats_dst_offsets + stats_sizes + stats_data_copied)
<< ", duration: " << duration << ", BW: "
std::cout << "Min. buffer size: " << min_buffer_size << ", max. buffer size: " << max_buffer_size //
<< ", num_buffers: " << num_buffers //
<< ", in_gen: " << ((input_gen == InputDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE") //
<< ", out_gen: " << ((output_gen == InputDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE") //
<< ", src size: " << stats_src_offsets << ", dst size: " << stats_dst_offsets //
<< ", sizes size: " << stats_sizes << ", cpy_data_size: " << stats_data_copied //
<< ", total: " << (stats_src_offsets + stats_dst_offsets + stats_sizes + stats_data_copied) //
<< ", duration: " << duration //
<< ", BW: "
<< ((double)(stats_src_offsets + stats_dst_offsets + stats_sizes + stats_data_copied) / 1000000000.0) /
(duration / 1000.0)
<< "\n";
Expand Down Expand Up @@ -329,6 +376,8 @@ int main(int argc, char **argv)
// Initialize device
CubDebugExit(args.DeviceInit());

// The most granular type being copied; buffers will be aligned to this type and their sizes will be an integer multiple of its size
using AtomicCopyT = uint32_t;
using BufferOffsetT = uint32_t;
using BufferSizeT = uint32_t;
using ByteOffsetT = uint32_t;
Expand All @@ -354,13 +403,15 @@ int main(int argc, char **argv)
{
for (const auto &buffer_size_range : buffer_size_ranges)
{
double average_buffer_size = (buffer_size_range.first + buffer_size_range.second) / 2.0;
BufferSizeT min_buffer_size = std::max(static_cast<BufferSizeT>(sizeof(AtomicCopyT)), buffer_size_range.first);
BufferSizeT max_buffer_size = std::max(static_cast<BufferSizeT>(sizeof(AtomicCopyT)), buffer_size_range.second);
double average_buffer_size = (min_buffer_size + max_buffer_size) / 2.0;
BufferOffsetT target_num_buffers = target_copy_size / average_buffer_size;
RunTest<uint64_t, BufferOffsetT, BufferSizeT, ByteOffsetT>(target_num_buffers,
buffer_size_range.first,
buffer_size_range.second,
InputDataGen::CONSECUTIVE,
InputDataGen::CONSECUTIVE,
RunTest<AtomicCopyT, BufferOffsetT, BufferSizeT, ByteOffsetT>(target_num_buffers,
min_buffer_size,
max_buffer_size,
InputDataGen::CONSECUTIVE,
InputDataGen::CONSECUTIVE,
10000ULL);
}
}
Expand Down

0 comments on commit 8f6d447

Please sign in to comment.