From 92a17c9979980c643fdf4484e74df5d32a598b18 Mon Sep 17 00:00:00 2001 From: James Liu Date: Mon, 25 Sep 2023 12:15:37 -0700 Subject: [PATCH] Directly copy data into uniform buffers (#9865) # Objective This is a minimally disruptive version of #8340. I attempted to update it, but failed due to the scope of the changes added in #8204. Fixes #8307. Partially addresses #4642. As seen in https://github.com/bevyengine/bevy/issues/8284, we're actually copying data twice in Prepare stage systems. Once into a CPU-side intermediate scratch buffer, and once again into a mapped buffer. This is inefficient and effectively doubles the time spent and memory allocated to run these systems. ## Solution Skip the scratch buffer entirely and use `wgpu::Queue::write_buffer_with` to directly write data into mapped buffers. Separately, this also directly uses `wgpu::Limits::min_uniform_buffer_offset_alignment` to set up the alignment when writing to the buffers. Partially addressing the issue raised in #4642. Storage buffers and the abstractions built on top of `DynamicUniformBuffer` will need to come in followup PRs. This may not have a noticeable performance difference in this PR, as the only first-party systems affected by this are view related, and likely are not going to be particularly heavy. --- ## Changelog Added: `DynamicUniformBuffer::get_writer`. Added: `DynamicUniformBufferWriter`. --- crates/bevy_pbr/src/prepass/mod.rs | 19 ++-- crates/bevy_pbr/src/render/fog.rs | 18 +-- crates/bevy_pbr/src/render/light.rs | 16 ++- crates/bevy_render/src/extract_component.rs | 19 ++-- .../src/render_resource/uniform_buffer.rs | 105 +++++++++++++++++- crates/bevy_render/src/view/mod.rs | 17 +-- 6 files changed, 154 insertions(+), 40 deletions(-) diff --git a/crates/bevy_pbr/src/prepass/mod.rs b/crates/bevy_pbr/src/prepass/mod.rs index ca1caf5c44720..9169d1083fde1 100644 --- a/crates/bevy_pbr/src/prepass/mod.rs +++ b/crates/bevy_pbr/src/prepass/mod.rs @@ -667,9 +667,16 @@ pub fn prepare_previous_view_projection_uniforms( With, >, ) { - view_uniforms.uniforms.clear(); - - for (entity, camera, maybe_previous_view_proj) in &views { + let views_iter = views.iter(); + let view_count = views_iter.len(); + let Some(mut writer) = + view_uniforms + .uniforms + .get_writer(view_count, &render_device, &render_queue) + else { + return; + }; + for (entity, camera, maybe_previous_view_proj) in views_iter { let view_projection = match maybe_previous_view_proj { Some(previous_view) => previous_view.clone(), None => PreviousViewProjection { @@ -679,13 +686,9 @@ pub fn prepare_previous_view_projection_uniforms( commands .entity(entity) .insert(PreviousViewProjectionUniformOffset { - offset: view_uniforms.uniforms.push(view_projection), + offset: writer.write(&view_projection), }); } - - view_uniforms - .uniforms - .write_buffer(&render_device, &render_queue); } #[derive(Default, Resource)] diff --git a/crates/bevy_pbr/src/render/fog.rs b/crates/bevy_pbr/src/render/fog.rs index 10fb61ff2bcc9..4df01418f1d30 100644 --- a/crates/bevy_pbr/src/render/fog.rs +++ b/crates/bevy_pbr/src/render/fog.rs @@ -52,9 +52,15 @@ pub fn prepare_fog( mut fog_meta: ResMut, views: Query<(Entity, Option<&FogSettings>), With>, ) { - fog_meta.gpu_fogs.clear(); - - for (entity, fog) in &views { + let views_iter = views.iter(); + let view_count = views_iter.len(); + let Some(mut writer) = fog_meta + .gpu_fogs + .get_writer(view_count, &render_device, &render_queue) + else { + return; + }; + for (entity, fog) in views_iter { let gpu_fog = if let Some(fog) = fog { match &fog.falloff { FogFalloff::Linear { start, end } => GpuFog { @@ -103,13 +109,9 @@ pub fn prepare_fog( // This is later read by `SetMeshViewBindGroup` commands.entity(entity).insert(ViewFogUniformOffset { - offset: fog_meta.gpu_fogs.push(gpu_fog), + offset: writer.write(&gpu_fog), }); } - - fog_meta - .gpu_fogs - .write_buffer(&render_device, &render_queue); } /// Inserted on each `Entity` with an `ExtractedView` to keep track of its offset diff --git a/crates/bevy_pbr/src/render/light.rs b/crates/bevy_pbr/src/render/light.rs index bad686b92f2f9..c476afc76e1f6 100644 --- a/crates/bevy_pbr/src/render/light.rs +++ b/crates/bevy_pbr/src/render/light.rs @@ -667,7 +667,15 @@ pub fn prepare_lights( point_lights: Query<(Entity, &ExtractedPointLight)>, directional_lights: Query<(Entity, &ExtractedDirectionalLight)>, ) { - light_meta.view_gpu_lights.clear(); + let views_iter = views.iter(); + let views_count = views_iter.len(); + let Some(mut view_gpu_lights_writer) = + light_meta + .view_gpu_lights + .get_writer(views_count, &render_device, &render_queue) + else { + return; + }; // Pre-calculate for PointLights let cube_face_projection = @@ -1198,14 +1206,10 @@ pub fn prepare_lights( lights: view_lights, }, ViewLightsUniformOffset { - offset: light_meta.view_gpu_lights.push(gpu_lights), + offset: view_gpu_lights_writer.write(&gpu_lights), }, )); } - - light_meta - .view_gpu_lights - .write_buffer(&render_device, &render_queue); } // this must match CLUSTER_COUNT_SIZE in pbr.wgsl diff --git a/crates/bevy_render/src/extract_component.rs b/crates/bevy_render/src/extract_component.rs index df555d392a96b..4c6e4f8553928 100644 --- a/crates/bevy_render/src/extract_component.rs +++ b/crates/bevy_render/src/extract_component.rs @@ -132,24 +132,27 @@ fn prepare_uniform_components( ) where C: ShaderType + WriteInto + Clone, { - component_uniforms.uniforms.clear(); - let entities = components - .iter() + let components_iter = components.iter(); + let count = components_iter.len(); + let Some(mut writer) = + component_uniforms + .uniforms + .get_writer(count, &render_device, &render_queue) + else { + return; + }; + let entities = components_iter .map(|(entity, component)| { ( entity, DynamicUniformIndex:: { - index: component_uniforms.uniforms.push(component.clone()), + index: writer.write(component), marker: PhantomData, }, ) }) .collect::>(); commands.insert_or_spawn_batch(entities); - - component_uniforms - .uniforms - .write_buffer(&render_device, &render_queue); } /// This plugin extracts the components into the "render world". diff --git a/crates/bevy_render/src/render_resource/uniform_buffer.rs b/crates/bevy_render/src/render_resource/uniform_buffer.rs index 4c1ad61b2aeb8..3876866b639d3 100644 --- a/crates/bevy_render/src/render_resource/uniform_buffer.rs +++ b/crates/bevy_render/src/render_resource/uniform_buffer.rs @@ -1,14 +1,17 @@ -use std::marker::PhantomData; +use std::{marker::PhantomData, num::NonZeroU64}; use crate::{ render_resource::Buffer, renderer::{RenderDevice, RenderQueue}, }; use encase::{ - internal::WriteInto, DynamicUniformBuffer as DynamicUniformBufferWrapper, ShaderType, + internal::{AlignmentValue, BufferMut, WriteInto}, + DynamicUniformBuffer as DynamicUniformBufferWrapper, ShaderType, UniformBuffer as UniformBufferWrapper, }; -use wgpu::{util::BufferInitDescriptor, BindingResource, BufferBinding, BufferUsages}; +use wgpu::{ + util::BufferInitDescriptor, BindingResource, BufferBinding, BufferDescriptor, BufferUsages, +}; /// Stores data to be transferred to the GPU and made accessible to shaders as a uniform buffer. /// @@ -240,6 +243,67 @@ impl DynamicUniformBuffer { self.changed = true; } + /// Creates a writer that can be used to directly write elements into the target buffer. + /// + /// This method uses less memory and performs fewer memory copies using over [`push`] and [`write_buffer`]. + /// + /// `max_count` *must* be greater than or equal to the number of elements that are to be written to the buffer, or + /// the writer will panic while writing. Dropping the writer will schedule the buffer write into the provided + /// [`RenderQueue`](crate::renderer::RenderQueue). + /// + /// If there is no GPU-side buffer allocated to hold the data currently stored, or if a GPU-side buffer previously + /// allocated does not have enough capacity to hold `max_count` elements, a new GPU-side buffer is created. + /// + /// Returns `None` if there is no allocated GPU-side buffer, and `max_count` is 0. + /// + /// [`push`]: Self::push + /// [`write_buffer`]: Self::write_buffer + #[inline] + pub fn get_writer<'a>( + &'a mut self, + max_count: usize, + device: &RenderDevice, + queue: &'a RenderQueue, + ) -> Option> { + let alignment = + AlignmentValue::new(device.limits().min_uniform_buffer_offset_alignment as u64); + let mut capacity = self.buffer.as_deref().map(wgpu::Buffer::size).unwrap_or(0); + let size = alignment + .round_up(T::min_size().get()) + .checked_mul(max_count as u64) + .unwrap(); + + if capacity < size || self.changed { + let buffer = device.create_buffer(&BufferDescriptor { + label: self.label.as_deref(), + usage: self.buffer_usage, + size, + mapped_at_creation: false, + }); + capacity = buffer.size(); + self.buffer = Some(buffer); + self.changed = false; + } + + if let Some(buffer) = self.buffer.as_deref() { + let buffer_view = queue + .write_buffer_with(buffer, 0, NonZeroU64::new(buffer.size())?) + .unwrap(); + Some(DynamicUniformBufferWriter { + buffer: encase::DynamicUniformBuffer::new_with_alignment( + QueueWriteBufferViewWrapper { + capacity: capacity as usize, + buffer_view, + }, + alignment.get(), + ), + _marker: PhantomData, + }) + } else { + None + } + } + /// Queues writing of data from system RAM to VRAM using the [`RenderDevice`](crate::renderer::RenderDevice) /// and the provided [`RenderQueue`](crate::renderer::RenderQueue). /// @@ -268,3 +332,38 @@ impl DynamicUniformBuffer { self.scratch.set_offset(0); } } + +/// A writer that can be used to directly write elements into the target buffer. +/// +/// For more information, see [`DynamicUniformBuffer::get_writer`]. +pub struct DynamicUniformBufferWriter<'a, T> { + buffer: encase::DynamicUniformBuffer>, + _marker: PhantomData T>, +} + +impl<'a, T: ShaderType + WriteInto> DynamicUniformBufferWriter<'a, T> { + pub fn write(&mut self, value: &T) -> u32 { + self.buffer.write(value).unwrap() as u32 + } +} + +/// A wrapper to work around the orphan rule so that [`wgpu::QueueWriteBufferView`] can implement +/// [`encase::internal::BufferMut`]. +struct QueueWriteBufferViewWrapper<'a> { + buffer_view: wgpu::QueueWriteBufferView<'a>, + // Must be kept separately and cannot be retrieved from buffer_view, as the read-only access will + // invoke a panic. + capacity: usize, +} + +impl<'a> BufferMut for QueueWriteBufferViewWrapper<'a> { + #[inline] + fn capacity(&self) -> usize { + self.capacity + } + + #[inline] + fn write(&mut self, offset: usize, val: &[u8; N]) { + self.buffer_view.write(offset, val); + } +} diff --git a/crates/bevy_render/src/view/mod.rs b/crates/bevy_render/src/view/mod.rs index 012a62f76d80c..9759684148324 100644 --- a/crates/bevy_render/src/view/mod.rs +++ b/crates/bevy_render/src/view/mod.rs @@ -356,8 +356,15 @@ pub fn prepare_view_uniforms( Option<&MipBias>, )>, ) { - view_uniforms.uniforms.clear(); - + let view_iter = views.iter(); + let view_count = view_iter.len(); + let Some(mut writer) = + view_uniforms + .uniforms + .get_writer(view_count, &render_device, &render_queue) + else { + return; + }; for (entity, camera, temporal_jitter, mip_bias) in &views { let viewport = camera.viewport.as_vec4(); let unjittered_projection = camera.projection; @@ -380,7 +387,7 @@ pub fn prepare_view_uniforms( }; let view_uniforms = ViewUniformOffset { - offset: view_uniforms.uniforms.push(ViewUniform { + offset: writer.write(&ViewUniform { view_proj, unjittered_view_proj: unjittered_projection * inverse_view, inverse_view_proj: view * inverse_projection, @@ -397,10 +404,6 @@ pub fn prepare_view_uniforms( commands.entity(entity).insert(view_uniforms); } - - view_uniforms - .uniforms - .write_buffer(&render_device, &render_queue); } #[derive(Clone)]