diff --git a/test/test_device_batch_memcpy.cu b/test/test_device_batch_memcpy.cu
index b574f64c06..fc7bfb8d79 100644
--- a/test/test_device_batch_memcpy.cu
+++ b/test/test_device_batch_memcpy.cu
@@ -97,6 +97,42 @@ void __global__ BaselineBatchMemCpyPerBlockKernel(InputBufferIt input_buffer_it,
   }
 }
 
+template <typename BufferOffsetT, typename ByteOffsetT, typename BufferSizeT>
+std::vector<ByteOffsetT> GetShuffledBufferOffsets(const std::vector<BufferSizeT> &buffer_sizes)
+{
+  BufferOffsetT num_buffers = buffer_sizes.size();
+
+  // We're remapping the i-th buffer to pmt_idxs[i]
+  std::vector<BufferOffsetT> pmt_idxs(num_buffers);
+  std::iota(pmt_idxs.begin(), pmt_idxs.end(), static_cast<BufferOffsetT>(0));
+  std::shuffle(std::begin(pmt_idxs), std::end(pmt_idxs), std::default_random_engine());
+
+  // Compute the offsets using the new mapping
+  ByteOffsetT running_offset = {};
+  std::vector<ByteOffsetT> permuted_offsets;
+  permuted_offsets.reserve(num_buffers);
+  for (auto permuted_buffer_idx : pmt_idxs)
+  {
+    permuted_offsets.emplace_back(running_offset);
+    running_offset += buffer_sizes[permuted_buffer_idx];
+  }
+
+  // Generate the scatter indexes that identify where each buffer was mapped to
+  std::vector<BufferOffsetT> scatter_idxs(num_buffers);
+  for (BufferOffsetT i = 0; i < num_buffers; i++)
+  {
+    scatter_idxs[pmt_idxs[i]] = i;
+  }
+
+  std::vector<ByteOffsetT> new_offsets(num_buffers);
+  for (BufferOffsetT i = 0; i < num_buffers; i++)
+  {
+    new_offsets[i] = permuted_offsets[scatter_idxs[i]];
+  }
+
+  return new_offsets;
+}
+
 template <typename T>
 struct OffsetToPtrOp
 {
@@ -120,7 +156,7 @@ enum class InputDataGen
  * @brief
 *
  * @tparam AtomicT The most granular type being copied. All source and destination pointers will be aligned based on
- * this type
+ * this type; the number of bytes being copied will be an integer multiple of this type's size
  * @tparam BufferOffsetT Type used for indexing into the array of buffers
  * @tparam BufferSizeT Type used for indexing into individual bytes of a buffer (large enough to cover the max buffer
  * size)
@@ -170,8 +206,14 @@ void RunTest(BufferOffsetT num_buffers,
   ByteOffsetT num_total_bytes = 0;
   for (BufferOffsetT i = 0; i < num_buffers; i++)
   {
+    if (input_gen == InputDataGen::CONSECUTIVE)
+    {
+      h_buffer_src_offsets[i] = num_total_bytes;
+    }
+    if (output_gen == InputDataGen::CONSECUTIVE)
+    {
     h_buffer_dst_offsets[i] = num_total_bytes;
-    h_buffer_src_offsets[i] = num_total_bytes;
+    }
     num_total_bytes += h_buffer_sizes[i];
   }
 
@@ -181,13 +223,13 @@ void RunTest(BufferOffsetT num_buffers,
   // Shuffle input buffer source-offsets
   if (input_gen == InputDataGen::RANDOM)
   {
-    std::shuffle(std::begin(h_buffer_src_offsets), std::end(h_buffer_src_offsets), std::default_random_engine());
+    h_buffer_src_offsets = GetShuffledBufferOffsets<BufferOffsetT, ByteOffsetT>(h_buffer_sizes);
   }
 
   // Shuffle input buffer source-offsets
   if (output_gen == InputDataGen::RANDOM)
   {
-    std::shuffle(std::begin(h_buffer_dst_offsets), std::end(h_buffer_dst_offsets), std::default_random_engine());
+    h_buffer_dst_offsets = GetShuffledBufferOffsets<BufferOffsetT, ByteOffsetT>(h_buffer_sizes);
   }
 
   // Allocate device memory
@@ -295,10 +337,15 @@ void RunTest(BufferOffsetT num_buffers,
   size_t stats_sizes = sizeof(BufferSizeT) * num_buffers;
   size_t stats_data_copied = 2 * num_total_bytes;
 
-  std::cout << "Src size: " << stats_src_offsets << ", dst size: " << stats_dst_offsets
-            << ", Sizes size: " << stats_sizes << ", data_size: " << stats_data_copied
-            << ", total: " << (stats_src_offsets + stats_dst_offsets + stats_sizes + stats_data_copied)
-            << ", duration: " << duration << ", BW: "
+  std::cout << "Min. buffer size: " << min_buffer_size << ", max. buffer size: " << max_buffer_size //
+            << ", num_buffers: " << num_buffers //
+            << ", in_gen: " << ((input_gen == InputDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE") //
+            << ", out_gen: " << ((output_gen == InputDataGen::RANDOM) ? "SHFL" : "CONSECUTIVE") //
+            << ", src size: " << stats_src_offsets << ", dst size: " << stats_dst_offsets //
+            << ", sizes size: " << stats_sizes << ", cpy_data_size: " << stats_data_copied //
+            << ", total: " << (stats_src_offsets + stats_dst_offsets + stats_sizes + stats_data_copied) //
+            << ", duration: " << duration //
+            << ", BW: "
             << ((double)(stats_src_offsets + stats_dst_offsets + stats_sizes + stats_data_copied) / 1000000000.0) /
                  (duration / 1000.0)
             << "\n";
@@ -329,6 +376,8 @@ int main(int argc, char **argv)
   // Initialize device
   CubDebugExit(args.DeviceInit());
 
+  // The most granular type being copied; buffers will be aligned and their sizes will be an integer multiple of this type's size
+  using AtomicCopyT = uint32_t;
   using BufferOffsetT = uint32_t;
   using BufferSizeT = uint32_t;
   using ByteOffsetT = uint32_t;
@@ -354,13 +403,15 @@ int main(int argc, char **argv)
   {
     for (const auto &buffer_size_range : buffer_size_ranges)
     {
-      double average_buffer_size = (buffer_size_range.first + buffer_size_range.second) / 2.0;
+      BufferSizeT min_buffer_size = std::max(static_cast<BufferSizeT>(sizeof(AtomicCopyT)), buffer_size_range.first);
+      BufferSizeT max_buffer_size = std::max(static_cast<BufferSizeT>(sizeof(AtomicCopyT)), buffer_size_range.second);
+      double average_buffer_size = (min_buffer_size + max_buffer_size) / 2.0;
       BufferOffsetT target_num_buffers = target_copy_size / average_buffer_size;
-      RunTest<uint32_t, BufferOffsetT, BufferSizeT, ByteOffsetT>(target_num_buffers,
-                                                                 buffer_size_range.first,
-                                                                 buffer_size_range.second,
-                                                                 InputDataGen::CONSECUTIVE,
-                                                                 InputDataGen::CONSECUTIVE,
+      RunTest<AtomicCopyT, BufferOffsetT, BufferSizeT, ByteOffsetT>(target_num_buffers,
+                                                                    min_buffer_size,
+                                                                    max_buffer_size,
+                                                                    InputDataGen::CONSECUTIVE,
+                                                                    InputDataGen::CONSECUTIVE,
                                                                     10000ULL);
     }
   }
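Note (illustration only, not part of the patch): the following minimal, standalone sketch shows the layout that GetShuffledBufferOffsets produces, assuming all index/offset types are std::uint32_t as in main and using a made-up list of buffer sizes. Buffers are placed back-to-back in a randomly permuted order, so the returned offsets never overlap, while consecutive buffer indices no longer map to consecutive memory locations.

// Standalone illustration of the shuffled, non-overlapping buffer-offset scheme.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <random>
#include <vector>

int main()
{
  using BufferOffsetT = std::uint32_t;
  using ByteOffsetT   = std::uint32_t;
  using BufferSizeT   = std::uint32_t;

  // Hypothetical buffer sizes, just for demonstration
  std::vector<BufferSizeT> buffer_sizes{4, 8, 12, 16};
  BufferOffsetT num_buffers = static_cast<BufferOffsetT>(buffer_sizes.size());

  // pmt_idxs[i] is the index of the buffer that is laid out i-th in memory
  std::vector<BufferOffsetT> pmt_idxs(num_buffers);
  std::iota(pmt_idxs.begin(), pmt_idxs.end(), static_cast<BufferOffsetT>(0));
  std::shuffle(pmt_idxs.begin(), pmt_idxs.end(), std::default_random_engine{});

  // Walk the buffers in permuted order, assigning back-to-back byte offsets
  std::vector<ByteOffsetT> permuted_offsets(num_buffers);
  ByteOffsetT running_offset = 0;
  for (BufferOffsetT i = 0; i < num_buffers; i++)
  {
    permuted_offsets[i] = running_offset;
    running_offset += buffer_sizes[pmt_idxs[i]];
  }

  // Invert the permutation so each buffer can look up its own offset by index
  std::vector<ByteOffsetT> offsets(num_buffers);
  for (BufferOffsetT i = 0; i < num_buffers; i++)
  {
    offsets[pmt_idxs[i]] = permuted_offsets[i];
  }

  for (BufferOffsetT i = 0; i < num_buffers; i++)
  {
    std::cout << "buffer " << i << ": size " << buffer_sizes[i] << ", offset " << offsets[i] << "\n";
  }
  return 0;
}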