forked from bevyengine/bevy
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Meshlet fill cluster buffers rewritten (bevyengine#15955)
# Objective

- Make the meshlet fill cluster buffers pass slightly faster
- Address bevyengine#15920 for meshlets
- Added PreviousGlobalTransform as a required meshlet component to avoid extra archetype moves, slightly alleviating bevyengine#14681 for meshlets
- Enforce that MeshletPlugin::cluster_buffer_slots is not greater than 2^25 (glitches will occur otherwise). Technically this field controls post-lod/culling cluster count, and the issue is on pre-lod/culling cluster count, but it's still valid now, and in the future this will be more true.

Needs to be merged after bevyengine#15846 and bevyengine#15886

## Solution

- Old pass dispatched a thread per cluster, and did a binary search over the instances to find which instance the cluster belongs to, and what meshlet index within the instance it is.
- New pass dispatches a workgroup per instance, and has the workgroup loop over all meshlets in the instance in order to write out the cluster data.
- Use a push constant instead of arrayLength to fix the linked bug
- Remap 1d->2d dispatch for software raster only if actually needed to save on spawning excess workgroups

## Testing

- Did you test these changes? If so, how?
  - Ran the meshlet example, and an example with 1041 instances of 32217 meshlets per instance. Profiled the second scene with nsight, went from 0.55ms -> 0.40ms. Small savings. We're pretty much VRAM bandwidth bound at this point.
- How can other people (reviewers) test your changes? Is there anything specific they need to know?
  - Run the meshlet example

## Changelog (non-meshlets)

- PreviousGlobalTransform now implements the Default trait
- Loading branch information
Showing
11 changed files
with
140 additions
and
83 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,44 +1,50 @@ | ||
#import bevy_pbr::meshlet_bindings::{ | ||
cluster_count, | ||
meshlet_instance_meshlet_counts_prefix_sum, | ||
scene_instance_count, | ||
meshlet_global_cluster_count, | ||
meshlet_instance_meshlet_counts, | ||
meshlet_instance_meshlet_slice_starts, | ||
meshlet_cluster_instance_ids, | ||
meshlet_cluster_meshlet_ids, | ||
} | ||
|
||
/// Writes out instance_id and meshlet_id to the global buffers for each cluster in the scene. | ||
|
||
var<workgroup> cluster_slice_start_workgroup: u32; | ||
|
||
@compute | ||
@workgroup_size(128, 1, 1) // 128 threads per workgroup, 1 cluster per thread | ||
@workgroup_size(1024, 1, 1) // 1024 threads per workgroup, 1 instance per workgroup | ||
fn fill_cluster_buffers( | ||
@builtin(workgroup_id) workgroup_id: vec3<u32>, | ||
@builtin(num_workgroups) num_workgroups: vec3<u32>, | ||
@builtin(local_invocation_index) local_invocation_index: u32, | ||
) { | ||
// Calculate the cluster ID for this thread | ||
let cluster_id = local_invocation_index + 128u * dot(workgroup_id, vec3(num_workgroups.x * num_workgroups.x, num_workgroups.x, 1u)); | ||
if cluster_id >= cluster_count { return; } // TODO: Could be an arrayLength? | ||
|
||
// Binary search to find the instance this cluster belongs to | ||
var left = 0u; | ||
var right = arrayLength(&meshlet_instance_meshlet_counts_prefix_sum) - 1u; | ||
while left <= right { | ||
let mid = (left + right) / 2u; | ||
if meshlet_instance_meshlet_counts_prefix_sum[mid] <= cluster_id { | ||
left = mid + 1u; | ||
} else { | ||
right = mid - 1u; | ||
} | ||
// Calculate the instance ID for this workgroup | ||
var instance_id = workgroup_id.x + (workgroup_id.y * num_workgroups.x); | ||
if instance_id >= scene_instance_count { return; } | ||
|
||
let instance_meshlet_count = meshlet_instance_meshlet_counts[instance_id]; | ||
let instance_meshlet_slice_start = meshlet_instance_meshlet_slice_starts[instance_id]; | ||
|
||
// Reserve cluster slots for the instance and broadcast to the workgroup | ||
if local_invocation_index == 0u { | ||
cluster_slice_start_workgroup = atomicAdd(&meshlet_global_cluster_count, instance_meshlet_count); | ||
} | ||
let instance_id = right; | ||
let cluster_slice_start = workgroupUniformLoad(&cluster_slice_start_workgroup); | ||
|
||
// Find the meshlet ID for this cluster within the instance's MeshletMesh | ||
let meshlet_id_local = cluster_id - meshlet_instance_meshlet_counts_prefix_sum[instance_id]; | ||
// Loop enough times to write out all the meshlets for the instance given that each thread writes 1 meshlet in each iteration | ||
for (var clusters_written = 0u; clusters_written < instance_meshlet_count; clusters_written += 1024u) { | ||
// Calculate meshlet ID within this instance's MeshletMesh to process for this thread | ||
let meshlet_id_local = clusters_written + local_invocation_index; | ||
if meshlet_id_local >= instance_meshlet_count { return; } | ||
|
||
// Find the overall meshlet ID in the global meshlet buffer | ||
let meshlet_id = meshlet_id_local + meshlet_instance_meshlet_slice_starts[instance_id]; | ||
// Find the overall cluster ID in the global cluster buffer | ||
let cluster_id = cluster_slice_start + meshlet_id_local; | ||
|
||
// Write results to buffers | ||
meshlet_cluster_instance_ids[cluster_id] = instance_id; | ||
meshlet_cluster_meshlet_ids[cluster_id] = meshlet_id; | ||
// Find the overall meshlet ID in the global meshlet buffer | ||
let meshlet_id = instance_meshlet_slice_start + meshlet_id_local; | ||
|
||
// Write results to buffers | ||
meshlet_cluster_instance_ids[cluster_id] = instance_id; | ||
meshlet_cluster_meshlet_ids[cluster_id] = meshlet_id; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.