Skip to content

Commit

Permalink
Fix subgroup_arithmetic benchmark for flexible subgroup sizes (#46)
Browse files Browse the repository at this point in the history
Set up source data values according to the subgroupSize device property.
But inside the shader, measure the actual subgroup size, and pass it
out of the shader. Then, adjust the verification logic to take that
actual subgroup size into account.

This still does the same amount of arithmetic as in the original
code, and it does it in the same shape.

Fixes: #45
  • Loading branch information
dneto0 authored Nov 27, 2023
1 parent 90866c8 commit c9fe9ca
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 16 deletions.
5 changes: 4 additions & 1 deletion benchmarks/subgroup/subgroup_arithmetic_intrinsic.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#extension GL_KHR_shader_subgroup_basic : enable
#extension GL_KHR_shader_subgroup_arithmetic : enable
#extension GL_KHR_shader_subgroup_ballot : enable

layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

Expand All @@ -28,12 +29,13 @@ layout(set = 0, binding = 0) buffer InputBuffer {
// Use an output buffer of the same size to make sure we use each element
// in the input buffer.
layout(set = 0, binding = 1) buffer OutputBuffer {
// Subgroup size observed at run time. main() stores
// subgroupBallotBitCount(subgroupBallot(true)) here, and the host reads it
// back as the first 32-bit word of this buffer to drive verification.
uint actual_subgroup_size;
// Per-invocation result, indexed by gl_GlobalInvocationID.x.
// NOTE(review): the leading uint shifts this array by one word; confirm the
// bound buffer is large enough for kArraySize floats plus that word.
float output_values[kArraySize];
};

void main() {
uint index = gl_GlobalInvocationID.x;
uint count = gl_SubgroupSize;
uint subgroup_size = subgroupBallotBitCount(subgroupBallot(true));
float value = 0.f;

#ifdef ARITHMETIC_ADD
Expand All @@ -48,6 +50,7 @@ void main() {
value = input_values[index];
}

actual_subgroup_size = subgroup_size;
output_values[index] = value;
}

6 changes: 5 additions & 1 deletion benchmarks/subgroup/subgroup_arithmetic_loop.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#version 450 core

#extension GL_KHR_shader_subgroup_basic : enable
#extension GL_KHR_shader_subgroup_ballot : enable

layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

Expand All @@ -27,12 +28,14 @@ layout(set = 0, binding = 0) buffer InputBuffer {
// Use an output buffer of the same size to make sure we use each element
// in the input buffer.
layout(set = 0, binding = 1) buffer OutputBuffer {
// Subgroup size observed at run time. main() stores
// subgroupBallotBitCount(subgroupBallot(true)) here, and the host reads it
// back as the first 32-bit word of this buffer to drive verification.
uint actual_subgroup_size;
// Per-invocation result, indexed by gl_GlobalInvocationID.x.
// NOTE(review): the leading uint shifts this array by one word; confirm the
// bound buffer is large enough for kArraySize floats plus that word.
float output_values[kArraySize];
};

void main() {
uint index = gl_GlobalInvocationID.x;
uint count = gl_SubgroupSize;
uint subgroup_size = subgroupBallotBitCount(subgroupBallot(true));
uint count = subgroup_size;
float value = 0.f;

if (subgroupElect()) {
Expand All @@ -50,5 +53,6 @@ void main() {
value = input_values[index];
}

actual_subgroup_size = subgroup_size;
output_values[index] = value;
}
32 changes: 18 additions & 14 deletions benchmarks/subgroup/subgroup_arithmetic_main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ static void CalculateSubgroupArithmetic(
::benchmark::State &state, ::uvkc::vulkan::Device *device,
const ::uvkc::benchmark::LatencyMeasure *latency_measure,
const uint32_t *code, size_t code_num_words, int num_elements,
uint32_t subgroup_size, Arithmetic arith_op) {
uint32_t proposed_subgroup_size, Arithmetic arith_op) {
size_t buffer_num_bytes = num_elements * sizeof(float);

//===-------------------------------------------------------------------===/
Expand Down Expand Up @@ -116,10 +116,11 @@ static void CalculateSubgroupArithmetic(
//===-------------------------------------------------------------------===/

// +: fill the whole buffer with 1.0f.
// *: fill with alternating subgroup_size and (1 / subgroup_size).
// *: fill with alternating values of proposed_subgroup_size and
// (1 / proposed_subgroup_size).
BM_CHECK_OK(::uvkc::benchmark::SetDeviceBufferViaStagingBuffer(
device, src_buffer.get(), buffer_num_bytes,
[arith_op, subgroup_size](void *ptr, size_t num_bytes) {
[arith_op, proposed_subgroup_size](void *ptr, size_t num_bytes) {
float *src_float_buffer = reinterpret_cast<float *>(ptr);
switch (arith_op) {
case Arithmetic::Add: {
Expand All @@ -129,8 +130,8 @@ static void CalculateSubgroupArithmetic(
} break;
case Arithmetic::Mul: {
for (int i = 0; i < num_bytes / sizeof(float); i += 2) {
src_float_buffer[i] = subgroup_size;
src_float_buffer[i + 1] = 1.0f / subgroup_size;
src_float_buffer[i] = proposed_subgroup_size;
src_float_buffer[i + 1] = 1.0f / proposed_subgroup_size;
}
} break;
}
Expand Down Expand Up @@ -171,14 +172,17 @@ static void CalculateSubgroupArithmetic(

BM_CHECK_OK(::uvkc::benchmark::GetDeviceBufferViaStagingBuffer(
device, dst_buffer.get(), buffer_num_bytes,
[arith_op, subgroup_size](void *ptr, size_t num_bytes) {
float *dst_float_buffer = reinterpret_cast<float *>(ptr);
[arith_op, proposed_subgroup_size](void *ptr, size_t num_bytes) {
const uint32_t actual_subgroup_size =
reinterpret_cast<uint32_t *>(ptr)[0];
float *dst_float_buffer = reinterpret_cast<float *>(ptr) + 1;
const auto num_floats = (num_bytes / sizeof(float)) - 1;
switch (arith_op) {
case Arithmetic::Add: {
for (int i = 0; i < num_bytes / sizeof(float); ++i) {
for (int i = 0; i < num_floats; ++i) {
float expected_value = 1.0f;
if (i % subgroup_size == 0) {
expected_value = subgroup_size;
if (i % actual_subgroup_size == 0) {
expected_value = actual_subgroup_size;
}

BM_CHECK_EQ(dst_float_buffer[i], expected_value)
Expand All @@ -188,14 +192,14 @@ static void CalculateSubgroupArithmetic(
}
} break;
case Arithmetic::Mul: {
for (int i = 0; i < num_bytes / sizeof(float); ++i) {
for (int i = 0; i < num_floats; ++i) {
float expected_value = 0.0f;
if (i % subgroup_size == 0) {
if (i % actual_subgroup_size == 0) {
expected_value = 1.0f;
} else if (i % 2 == 0) {
expected_value = subgroup_size;
expected_value = proposed_subgroup_size;
} else {
expected_value = 1.0f / subgroup_size;
expected_value = 1.0f / proposed_subgroup_size;
}

BM_CHECK_EQ(dst_float_buffer[i], expected_value)
Expand Down

0 comments on commit c9fe9ca

Please sign in to comment.