Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
adds more benchmarks
Browse files · Browse the repository at this point in the history
  • Loading branch information
elstehle committed Oct 11, 2021
1 parent 38a028e commit d3086fc
Show file tree
Hide file tree
Showing 3 changed files with 161 additions and 81 deletions.
22 changes: 12 additions & 10 deletions cub/agent/agent_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ private:
static constexpr uint32_t TLEV_BUFFERS_PER_THREAD = BUFFERS_PER_THREAD;
static constexpr uint32_t BLEV_BUFFERS_PER_THREAD = BUFFERS_PER_THREAD;

static constexpr uint32_t WARP_LEVEL_THRESHOLD = 32;
static constexpr uint32_t WARP_LEVEL_THRESHOLD = 128;
static constexpr uint32_t BLOCK_LEVEL_THRESHOLD = 8 * 1024;

static constexpr uint32_t BUFFER_STABLE_PARTITION = false;
Expand Down Expand Up @@ -549,23 +549,25 @@ private:
BufferSizeIteratorT tile_buffer_sizes,
BlockBufferOffsetT num_wlev_buffers)
{
constexpr size_t out_datatype_size = sizeof(uint4);
constexpr size_t in_datatype_size = sizeof(uint);
constexpr uint32_t out_datatype_size = sizeof(uint4);
constexpr uint32_t in_datatype_size = sizeof(uint32_t);

int warp_id = threadIdx.x / CUB_PTX_WARP_THREADS;
int warp_lane = threadIdx.x % CUB_PTX_WARP_THREADS;
constexpr uint32_t WARPS_PER_BLOCK = BLOCK_THREADS / CUB_PTX_WARP_THREADS;

for (BlockBufferOffsetT istring = warp_id; istring < num_wlev_buffers; istring += WARPS_PER_BLOCK)
for (BlockBufferOffsetT buffer_offset = warp_id; buffer_offset < num_wlev_buffers; buffer_offset += WARPS_PER_BLOCK)
{
uint8_t *out_chars = reinterpret_cast<uint8_t *>(tile_buffer_dsts[buffers_by_size_class[istring].buffer_id]);
auto const alignment_offset = reinterpret_cast<std::uintptr_t>(out_chars) % out_datatype_size;
uint4 *out_chars_aligned = reinterpret_cast<uint4 *>(out_chars - alignment_offset);
uint8_t *out_chars =
reinterpret_cast<uint8_t *>(tile_buffer_dsts[buffers_by_size_class[buffer_offset].buffer_id]);
uint32_t const alignment_offset = reinterpret_cast<std::uintptr_t>(out_chars) % out_datatype_size;
uint4 *out_chars_aligned = reinterpret_cast<uint4 *>(out_chars - alignment_offset);

auto const out_start = 0;
auto const out_end = out_start + tile_buffer_sizes[buffers_by_size_class[istring].buffer_id];
uint32_t const out_start = 0U;
uint32_t const out_end = out_start + tile_buffer_sizes[buffers_by_size_class[buffer_offset].buffer_id];

const char *in_start = reinterpret_cast<const char *>(tile_buffer_srcs[buffers_by_size_class[istring].buffer_id]);
const char *in_start =
reinterpret_cast<const char *>(tile_buffer_srcs[buffers_by_size_class[buffer_offset].buffer_id]);

// Both `out_start_aligned` and `out_end_aligned` are indices into `out_chars`.
// `out_start_aligned` is the first 16B aligned memory location after `out_start + 4`.
Expand Down
14 changes: 7 additions & 7 deletions cub/device/dispatch/dispatch_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ template <typename BufferOffsetT, typename BlockOffsetT>
struct DeviceBatchMemcpyPolicy
{
static constexpr uint32_t BLOCK_THREADS = 128U;
static constexpr uint32_t BUFFERS_PER_THREAD = 2U;
static constexpr uint32_t BUFFERS_PER_THREAD = 4U;
static constexpr uint32_t TLEV_BYTES_PER_THREAD = 8U;

static constexpr uint32_t LARGE_BUFFER_BLOCK_THREADS = 256U;
Expand Down Expand Up @@ -464,7 +464,7 @@ struct DispatchBatchMemcpy : DeviceBatchMemcpyPolicy<BufferOffsetT, BlockOffsetT
ActivePolicyT::AgentSmallBufferPolicyT::BUFFERS_PER_THREAD;

// The number of thread blocks (or tiles) required to process all of the given buffers
BlockOffsetT num_tiles = cub::DivideAndRoundUp(num_buffers, TILE_SIZE);
BlockOffsetT num_tiles = DivideAndRoundUp(num_buffers, TILE_SIZE);

// Temporary storage allocation requirements
void *allocations[MEM_NUM_ALLOCATIONS] = {};
Expand Down Expand Up @@ -515,8 +515,8 @@ struct DispatchBatchMemcpy : DeviceBatchMemcpyPolicy<BufferOffsetT, BlockOffsetT
reinterpret_cast<BlevBufferTileOffsetsOutItT>(allocations[MEM_BLEV_BUFFER_TBLOCK]);

// Kernels' grid sizes
uint32_t init_grid_size = cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS);
uint32_t scan_grid_size = cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS);
uint32_t init_grid_size = DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS);
uint32_t scan_grid_size = DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS);
uint32_t batch_memcpy_grid_size = num_tiles;

// Kernels
Expand Down Expand Up @@ -607,9 +607,9 @@ struct DispatchBatchMemcpy : DeviceBatchMemcpyPolicy<BufferOffsetT, BlockOffsetT

// Invoke kernel to copy small buffers and put the larger ones into a queue that will get picked up by next kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(batch_memcpy_grid_size,
ActivePolicyT::AgentSmallBufferPolicyT::BLOCK_THREADS,
0,
stream)
ActivePolicyT::AgentSmallBufferPolicyT::BLOCK_THREADS,
0,
stream)
.doit(bach_memcpy_non_blev_kernel,
input_buffer_it,
output_buffer_it,
Expand Down
Loading

0 comments on commit d3086fc

Please sign in to comment.