wgpu: Allow unaligned writes to IndexBuffer3D

wgpu requires buffer copy sizes and offsets to be 4-byte aligned. Unfortunately, ActionScript can perform 2-byte aligned uploads into an IndexBuffer3D. To support this, we now keep a copy of the IndexBuffer3D on the CPU. When performing an upload to the buffer, we round the offset down and the size up to the nearest 4-byte aligned value. The cpu buffer is used to fill out the write with existing data, so that we don't corrupt the contents of the GPU buffer. To avoid introducing a new RefCell, I've changed IndexBuffer3D to use a `Box` instead of an `Rc` to store the trait object. This allows us to pass a mutable reference down to the backend.
ruffle-rs · May 10, 2023 · e488cc9 · e488cc9
1 parent ea78bf2
commit e488cc9
Show file tree

Hide file tree

Showing 12 changed files with 1,027 additions and 33 deletions.
diff --git a/core/src/avm2/globals/flash/display3D/index_buffer_3d.rs b/core/src/avm2/globals/flash/display3D/index_buffer_3d.rs
@@ -62,10 +62,6 @@ pub fn upload_from_vector<'gc>(
 
         index_buffer.set_count(count as usize, activation.context.gc_context);
 
-        if start_offset != 0 {
-            panic!("What exactly does start_offset do?");
-        }
-
         let data: Result<Vec<u16>, _> = vector
             .iter()
             .map(|val| {

diff --git a/core/src/avm2/object/context3d_object.rs b/core/src/avm2/object/context3d_object.rs
@@ -182,14 +182,15 @@ impl<'gc> Context3DObject<'gc> {
         start_offset: usize,
         activation: &mut Activation<'_, 'gc>,
     ) {
+        let mut handle = buffer.handle(activation.context.gc_context);
         self.0
             .write(activation.context.gc_context)
             .render_context
             .as_mut()
             .unwrap()
             .process_command(
                 Context3DCommand::UploadToIndexBuffer {
-                    buffer: buffer.handle(),
+                    buffer: &mut *handle,
                     data,
                     start_offset,
                 },
@@ -279,6 +280,7 @@ impl<'gc> Context3DObject<'gc> {
             // FIXME - should we error if the number of indices isn't a multiple of 3?
             num_triangles = (index_buffer.count() / 3) as i32;
         }
+        let handle = index_buffer.handle(activation.context.gc_context);
 
         self.0
             .write(activation.context.gc_context)
@@ -287,7 +289,7 @@ impl<'gc> Context3DObject<'gc> {
             .unwrap()
             .process_command(
                 Context3DCommand::DrawTriangles {
-                    index_buffer: index_buffer.handle(),
+                    index_buffer: &*handle,
                     first_index: first_index as usize,
                     num_triangles: num_triangles as isize,
                 },

diff --git a/core/src/avm2/object/index_buffer_3d_object.rs b/core/src/avm2/object/index_buffer_3d_object.rs
@@ -8,7 +8,6 @@ use crate::avm2::Error;
 use gc_arena::{Collect, GcCell, MutationContext};
 use ruffle_render::backend::IndexBuffer;
 use std::cell::{Ref, RefMut};
-use std::rc::Rc;
 
 use super::Context3DObject;
 
@@ -20,7 +19,7 @@ impl<'gc> IndexBuffer3DObject<'gc> {
     pub fn from_handle(
         activation: &mut Activation<'_, 'gc>,
         context3d: Context3DObject<'gc>,
-        handle: Rc<dyn IndexBuffer>,
+        handle: Box<dyn IndexBuffer>,
     ) -> Result<Object<'gc>, Error<'gc>> {
         let class = activation.avm2().classes().indexbuffer3d;
         let base = ScriptObjectData::new(class);
@@ -50,8 +49,8 @@ impl<'gc> IndexBuffer3DObject<'gc> {
         self.0.write(mc).count = val;
     }
 
-    pub fn handle(&self) -> Rc<dyn IndexBuffer> {
-        self.0.read().handle.clone()
+    pub fn handle(&self, mc: MutationContext<'gc, '_>) -> RefMut<'_, dyn IndexBuffer> {
+        RefMut::map(self.0.write(mc), |data| &mut *data.handle)
     }
 
     pub fn context3d(&self) -> Context3DObject<'gc> {
@@ -65,7 +64,7 @@ pub struct IndexBuffer3DObjectData<'gc> {
     /// Base script object
     base: ScriptObjectData<'gc>,
 
-    handle: Rc<dyn IndexBuffer>,
+    handle: Box<dyn IndexBuffer>,
 
     count: usize,
 

diff --git a/render/src/backend.rs b/render/src/backend.rs
@@ -192,7 +192,8 @@ pub trait Context3D: Collect + Downcast {
     // objects after dispose() has been called.
     fn disposed_vertex_buffer_handle(&self) -> Rc<dyn VertexBuffer>;
 
-    fn create_index_buffer(&mut self, usage: BufferUsage, num_indices: u32) -> Rc<dyn IndexBuffer>;
+    fn create_index_buffer(&mut self, usage: BufferUsage, num_indices: u32)
+        -> Box<dyn IndexBuffer>;
     fn create_vertex_buffer(
         &mut self,
         usage: BufferUsage,
@@ -218,7 +219,7 @@ pub trait Context3D: Collect + Downcast {
 
     fn process_command<'gc>(
         &mut self,
-        command: Context3DCommand<'gc>,
+        command: Context3DCommand<'_, 'gc>,
         mc: MutationContext<'gc, '_>,
     );
 }
@@ -338,7 +339,7 @@ impl Context3DTextureFilter {
 
 #[derive(Collect)]
 #[collect(no_drop)]
-pub enum Context3DCommand<'gc> {
+pub enum Context3DCommand<'a, 'gc> {
     Clear {
         red: f64,
         green: f64,
@@ -365,7 +366,7 @@ pub enum Context3DCommand<'gc> {
     SetRenderToBackBuffer,
 
     UploadToIndexBuffer {
-        buffer: Rc<dyn IndexBuffer>,
+        buffer: &'a mut dyn IndexBuffer,
         start_offset: usize,
         data: Vec<u8>,
     },
@@ -378,7 +379,7 @@ pub enum Context3DCommand<'gc> {
     },
 
     DrawTriangles {
-        index_buffer: Rc<dyn IndexBuffer>,
+        index_buffer: &'a dyn IndexBuffer,
         first_index: usize,
         num_triangles: isize,
     },

diff --git a/render/wgpu/src/context3d/mod.rs b/render/wgpu/src/context3d/mod.rs
@@ -298,7 +298,12 @@ impl WgpuContext3D {
 
 #[derive(Collect)]
 #[collect(require_static)]
-pub struct IndexBufferWrapper(wgpu::Buffer);
+pub struct IndexBufferWrapper {
+    pub buffer: wgpu::Buffer,
+    /// A cpu-side copy of the buffer data. This is used to allow us to
+    /// perform unaligned writes to the GPU buffer, which is required by ActionScript.
+    pub data: Vec<u8>,
+}
 
 #[derive(Collect, Debug)]
 #[collect(require_static)]
@@ -347,14 +352,18 @@ impl Context3D for WgpuContext3D {
         &mut self,
         _ruffle_usage: ruffle_render::backend::BufferUsage,
         num_indices: u32,
-    ) -> Rc<dyn IndexBuffer> {
+    ) -> Box<dyn IndexBuffer> {
+        let size = align_copy_buffer_size(num_indices as usize * std::mem::size_of::<u16>()) as u32;
         let buffer = self.descriptors.device.create_buffer(&BufferDescriptor {
             label: None,
-            size: num_indices as u64 * 2,
+            size: size as u64,
             usage: BufferUsages::INDEX | BufferUsages::COPY_DST,
             mapped_at_creation: false,
         });
-        Rc::new(IndexBufferWrapper(buffer))
+        Box::new(IndexBufferWrapper {
+            buffer,
+            data: vec![0; size as usize],
+        })
     }
 
     fn create_vertex_buffer(
@@ -459,7 +468,7 @@ impl Context3D for WgpuContext3D {
 
     fn process_command<'gc>(
         &mut self,
-        command: Context3DCommand<'gc>,
+        command: Context3DCommand<'_, 'gc>,
         mc: MutationContext<'gc, '_>,
     ) {
         match command {
@@ -645,20 +654,40 @@ impl Context3D for WgpuContext3D {
                 start_offset,
                 data,
             } => {
-                let buffer: &IndexBufferWrapper = buffer
-                    .as_any()
-                    .downcast_ref::<IndexBufferWrapper>()
+                if data.is_empty() {
+                    return;
+                }
+                let buffer: &mut IndexBufferWrapper = buffer
+                    .as_any_mut()
+                    .downcast_mut::<IndexBufferWrapper>()
                     .unwrap();
 
+                // Unfortunately, ActionScript works with 2-byte indices, while wgpu requires
+                // copy offsets and sizes to have 4-byte alignment. To support this, we need
+                // to keep a copy of the data on the CPU side. We round *down* the offset to
+                // the closest multiple of 4 bytes, and round *up* the length to the closest
+                // multiple of 4 bytes. We then perform a copy from our CPU-side buffer, which
+                // which uses the existing data (at the beiginning or end) to fill out the copy
+                // to the required length and offset. Without this, we would lose data in the CPU
+                // buffer whenever we performed a copy with an unalignd offset or length.
+                let offset_bytes = start_offset * std::mem::size_of::<u16>();
+                let rounded_down_offset =
+                    offset_bytes - (offset_bytes % COPY_BUFFER_ALIGNMENT as usize);
+                let rounded_up_length = align_copy_buffer_size(data.len());
+
+                buffer.data[offset_bytes..(offset_bytes + data.len())].copy_from_slice(&data);
                 self.buffer_staging_belt
                     .write_buffer(
                         &mut self.buffer_command_encoder,
-                        &buffer.0,
-                        (start_offset * std::mem::size_of::<u16>()) as u64,
-                        NonZeroU64::new(data.len() as u64).unwrap(),
+                        &buffer.buffer,
+                        rounded_down_offset as u64,
+                        NonZeroU64::new(rounded_up_length as u64).unwrap(),
                         &self.descriptors.device,
                     )
-                    .copy_from_slice(&data);
+                    .copy_from_slice(
+                        &buffer.data
+                            [rounded_down_offset..(rounded_down_offset + rounded_up_length)],
+                    );
             }
 
             Context3DCommand::UploadToVertexBuffer {
@@ -667,21 +696,24 @@ impl Context3D for WgpuContext3D {
                 data32_per_vertex,
                 data,
             } => {
+                if data.is_empty() {
+                    return;
+                }
+
                 let buffer: Rc<VertexBufferWrapper> = buffer
                     .clone()
                     .into_any_rc()
                     .downcast::<VertexBufferWrapper>()
                     .unwrap();
 
-                let align = COPY_BUFFER_ALIGNMENT as usize;
-                let rounded_size = (data.len() + align - 1) & !(align - 1);
-
+                // ActionScript can only work with 32-bit chunks of data, so our `write_buffer`
+                // offset and size will always be a multiple of `COPY_BUFFER_ALIGNMENT` (4 bytes)
                 self.buffer_staging_belt.write_buffer(
                     &mut self.buffer_command_encoder,
                     &buffer.buffer,
                     (start_vertex * (data32_per_vertex as usize) * std::mem::size_of::<f32>())
                         as u64,
-                    NonZeroU64::new(rounded_size as u64).unwrap(),
+                    NonZeroU64::new(data.len() as u64).unwrap(),
                     &self.descriptors.device,
                 )[..data.len()]
                     .copy_from_slice(&data);
@@ -805,7 +837,8 @@ impl Context3D for WgpuContext3D {
 
                 let mut render_pass = self.make_render_pass(&mut render_command_encoder);
 
-                render_pass.set_index_buffer(index_buffer.0.slice(..), wgpu::IndexFormat::Uint16);
+                render_pass
+                    .set_index_buffer(index_buffer.buffer.slice(..), wgpu::IndexFormat::Uint16);
                 render_pass.draw_indexed(indices, 0, 0..1);
 
                 // A `RenderPass` needs to hold references to several fields in `self`, so we can't
@@ -1126,3 +1159,9 @@ fn convert_texture_format(input: Context3DTextureFormat) -> Result<wgpu::Texture
         )),
     }
 }
+
+// Rounds up 'len' to the nearest multiple of COPY_BUFFER_ALIGNMENT
+fn align_copy_buffer_size(len: usize) -> usize {
+    let align = COPY_BUFFER_ALIGNMENT as usize;
+    (len + align - 1) & !(align - 1)
+}