From 81342b6d27701076ced180357b66fef84f66e4da Mon Sep 17 00:00:00 2001
From: mitchmindtree
Date: Mon, 20 Sep 2021 17:48:20 +1000
Subject: [PATCH] Expose `wgpu` features. Use `Queue::write_texture` in constructors.

Adds a collection of features to `nannou_wgpu` that match those exposed by
`wgpu` itself. The `spirv` feature in particular will be useful for users
still using `.spv` shaders (as opposed to the new default `.wgsl` format).

Rewrites the `Texture` short-hand constructors to use `Queue::write_texture`
rather than using copy commands and submitting command buffers directly. This
is to avoid an Intel driver bug where submitting more than one command buffer
per frame appears to be causing issues:
https://github.com/gfx-rs/wgpu/issues/1672#issuecomment-917510810

While this likely means consuming more RAM, it also likely results in slightly
better performance due to reducing the number of command buffers submitted.
---
 nannou_wgpu/Cargo.toml                       |   7 +-
 nannou_wgpu/src/lib.rs                       |  23 +++
 nannou_wgpu/src/texture/image.rs             | 149 ++++++++++++++++---
 nannou_wgpu/src/texture/row_padded_buffer.rs |   7 +-
 4 files changed, 158 insertions(+), 28 deletions(-)

diff --git a/nannou_wgpu/Cargo.toml b/nannou_wgpu/Cargo.toml
index 29c7ca0f8..92dc509db 100644
--- a/nannou_wgpu/Cargo.toml
+++ b/nannou_wgpu/Cargo.toml
@@ -18,6 +18,11 @@ wgpu_upstream = { version = "0.10", package = "wgpu" }
 
 [features]
 capturer = ["image", "instant", "num_cpus"]
+replay = ["wgpu_upstream/replay"]
+serde = ["wgpu_upstream/serde"]
+spirv = ["wgpu_upstream/spirv"]
+trace = ["wgpu_upstream/trace"]
+webgl = ["wgpu_upstream/webgl"]
 
 [package.metadata.docs.rs]
-features = ["image", "capturer"]
+features = ["capturer", "image", "replay", "serde", "spirv", "trace", "webgl"]
diff --git a/nannou_wgpu/src/lib.rs b/nannou_wgpu/src/lib.rs
index 6ee084f2f..97686f4c4 100644
--- a/nannou_wgpu/src/lib.rs
+++ b/nannou_wgpu/src/lib.rs
@@ -106,6 +106,20 @@ pub const DEFAULT_POWER_PREFERENCE: PowerPreference = PowerPreference::HighPerfo
 /// Nannou's default WGPU backend preferences.
 pub const DEFAULT_BACKENDS: Backends = Backends::PRIMARY;
 
+/// Create a wgpu shader module from the given slice of SPIR-V bytes.
+#[cfg(feature = "spirv")]
+pub fn shader_from_spirv_bytes(
+    device: &wgpu_upstream::Device,
+    bytes: &[u8],
+) -> wgpu_upstream::ShaderModule {
+    let source = util::make_spirv(bytes);
+    let desc = ShaderModuleDescriptor {
+        label: Some("nannou_shader_module"),
+        source,
+    };
+    device.create_shader_module(&desc)
+}
+
 /// Adds a simple render pass command to the given encoder that simply clears the given texture
 /// with the given colour.
 ///
@@ -181,6 +195,15 @@ pub fn sampler_filtering(desc: &SamplerDescriptor) -> bool {
     }
 }
 
+/// Given the initial number of bytes per row within an image, compute the number of bytes that
+/// must be added per row to produce a valid bytes per row alignment.
+///
+/// See here:
+/// https://docs.rs/wgpu/latest/wgpu/struct.ImageDataLayout.html#structfield.bytes_per_row
+pub fn compute_row_padding(bytes_per_row: u32) -> u32 {
+    COPY_BYTES_PER_ROW_ALIGNMENT - (bytes_per_row % COPY_BYTES_PER_ROW_ALIGNMENT)
+}
+
 /// The functions within this module use unsafe in order to retrieve their input as a slice of
 /// bytes. This is necessary in order to upload data to the GPU via the wgpu
 /// `DeviceExt::create_buffer_init` buffer constructor. This method is unsafe as the type `T` may contain
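For illustration, downstream usage of the new `spirv` feature might look like the following sketch. The crate feature setup and the shader path are assumptions, not part of this patch, and the example assumes `nannou_wgpu` re-exports the upstream `Device` and `ShaderModule` types as it does for most `wgpu` items.

```rust
// Assumes the downstream crate enables the feature, e.g. in Cargo.toml:
//     nannou_wgpu = { version = "*", features = ["spirv"] }
use nannou_wgpu as wgpu;

fn fragment_shader(device: &wgpu::Device) -> wgpu::ShaderModule {
    // Path to a pre-compiled `.spv` shader; purely illustrative.
    let bytes: &[u8] = include_bytes!("shaders/frag.spv");
    // Helper added by this patch, gated behind the `spirv` feature.
    wgpu::shader_from_spirv_bytes(device, bytes)
}
```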
diff --git a/nannou_wgpu/src/texture/image.rs b/nannou_wgpu/src/texture/image.rs
index 18739a53f..c2ccf9ac6 100644
--- a/nannou_wgpu/src/texture/image.rs
+++ b/nannou_wgpu/src/texture/image.rs
@@ -93,7 +93,7 @@ impl wgpu::Texture {
     ///
     /// If the `&App` is passed as the `src`, the window returned via `app.main_window()` will be
     /// used as the source of the device and queue.
-    pub fn from_path<'a, T, P>(src: T, path: P) -> image::ImageResult<Self>
+    pub fn from_path<T, P>(src: T, path: P) -> image::ImageResult<Self>
     where
         T: WithDeviceQueuePair,
         P: AsRef<Path>,
@@ -123,7 +123,7 @@ impl wgpu::Texture {
     ///
     /// The `DeviceQueuePairSource` can be either the `App`, a `Window`, a `DeviceQueuePair` or a
     /// tuple `(&Device, &Queue)`.
-    pub fn from_image<'a, T>(src: T, image: &image::DynamicImage) -> Self
+    pub fn from_image<T>(src: T, image: &image::DynamicImage) -> Self
     where
         T: WithDeviceQueuePair,
     {
@@ -498,6 +498,10 @@ where
 
 /// Load a texture directly from a dynamic image.
 ///
+/// This uses the `Queue::write_texture` method, meaning that the texture is not immediately
+/// written. Rather, the write is enqueued internally and scheduled to happen at the start of the
+/// next call to `Queue::submit`.
+///
 /// If the image is already in a format supported by wgpu, no conversions are performed and the
 /// image is loaded directly as-is with a texture format that matches the original image color
 /// type.
@@ -510,17 +514,37 @@ pub fn load_texture_from_image(
     usage: wgpu::TextureUsages,
     image: &image::DynamicImage,
 ) -> wgpu::Texture {
-    let cmd_encoder_desc = wgpu::CommandEncoderDescriptor {
-        label: Some("nannou_texture_from_image"),
-    };
-    let mut encoder = device.create_command_encoder(&cmd_encoder_desc);
-    let texture = encode_load_texture_from_image(device, &mut encoder, usage, image);
-    queue.submit(std::iter::once(encoder.finish()));
-    texture
+    use image::DynamicImage::*;
+    match image {
+        ImageLuma8(img) => load_texture_from_image_buffer(device, queue, usage, img),
+        ImageLumaA8(img) => load_texture_from_image_buffer(device, queue, usage, img),
+        ImageRgba8(img) => load_texture_from_image_buffer(device, queue, usage, img),
+        ImageBgra8(img) => load_texture_from_image_buffer(device, queue, usage, img),
+        ImageLuma16(img) => load_texture_from_image_buffer(device, queue, usage, img),
+        ImageLumaA16(img) => load_texture_from_image_buffer(device, queue, usage, img),
+        ImageRgba16(img) => load_texture_from_image_buffer(device, queue, usage, img),
+        ImageRgb8(_img) => {
+            let img = image.to_rgba8();
+            load_texture_from_image_buffer(device, queue, usage, &img)
+        }
+        ImageBgr8(_img) => {
+            let img = image.to_bgra8();
+            load_texture_from_image_buffer(device, queue, usage, &img)
+        }
+        ImageRgb16(_img) => {
+            // TODO: I think we lose some quality here - e.g. 16-bit channels down to 8-bit??.
+            let img = image.to_rgba8();
+            load_texture_from_image_buffer(device, queue, usage, &img)
+        }
+    }
 }
 
 /// Load a texture directly from an image buffer using the given device queue.
 ///
+/// This uses the `Queue::write_texture` method, meaning that the texture is not immediately
+/// written. Rather, the write is enqueued internally and scheduled to happen at the start of the
+/// next call to `Queue::submit`.
+///
 /// No format or size conversions are performed - the given buffer is loaded directly into GPU
 /// memory.
 ///
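As a usage sketch (asset path and window setup are illustrative, not part of this patch), the short-hand constructors are called exactly as before from a typical nannou `model` function; the only behavioural difference is that the pixel upload is now scheduled via `Queue::write_texture` and lands at the next `Queue::submit`.

```rust
use nannou::prelude::*;

struct Model {
    texture: wgpu::Texture,
}

fn model(app: &App) -> Model {
    // Passing the `&App` uses the main window's device and queue. The image data is
    // handed to `Queue::write_texture`, so the copy happens at the next `Queue::submit`
    // (e.g. when the next frame's command buffer is submitted).
    let texture = wgpu::Texture::from_path(app, "assets/images/nature.png").unwrap();
    Model { texture }
}
```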
@@ -535,12 +559,38 @@ where
     P: 'static + Pixel,
     Container: std::ops::Deref<Target = [P::Subpixel]>,
 {
-    let cmd_encoder_desc = wgpu::CommandEncoderDescriptor {
-        label: Some("nannou_texture_from_image_buffer"),
+    // Create the texture.
+    let texture = wgpu::TextureBuilder::from_image_view(buffer)
+        .usage(wgpu::TextureBuilder::REQUIRED_IMAGE_TEXTURE_USAGE | usage)
+        .build(device);
+
+    // Describe the layout of the data.
+    let extent = texture.extent();
+    let format = texture.format();
+    let block_size = format.describe().block_size;
+    let bytes_per_row = extent.width * block_size as u32;
+    let image_data_layout = wgpu::ImageDataLayout {
+        offset: 0,
+        bytes_per_row: std::num::NonZeroU32::new(bytes_per_row),
+        rows_per_image: None,
+    };
+
+    // Copy into the entire texture.
+    let image_copy_texture = wgpu::ImageCopyTexture {
+        texture: texture.inner(),
+        mip_level: 0,
+        origin: wgpu::Origin3d::ZERO,
+        // TODO: Maybe we shouldn't assume this?
+        aspect: wgpu::TextureAspect::All,
     };
-    let mut encoder = device.create_command_encoder(&cmd_encoder_desc);
-    let texture = encode_load_texture_from_image_buffer(device, &mut encoder, usage, buffer);
-    queue.submit(std::iter::once(encoder.finish()));
+
+    // TODO:
+    // This can theoretically be exploited by implementing our `image::Pixel` trait for some type
+    // that has padding. Perhaps it should be an unsafe trait? Should investigate how to achieve
+    // this in a safer manner.
+    let data = unsafe { wgpu::bytes::from_slice(&*buffer) };
+
+    queue.write_texture(image_copy_texture, data, image_data_layout, extent);
     texture
 }
 
@@ -564,14 +614,71 @@ where
     P: 'static + Pixel,
     Container: 'a + std::ops::Deref<Target = [P::Subpixel]>,
 {
-    let cmd_encoder_desc = wgpu::CommandEncoderDescriptor {
-        label: Some("nannou_load_3d_texture_from_image_buffers"),
+    let mut buffers = buffers.into_iter();
+    let array_layers = buffers.len() as u32;
+    let first_buffer = buffers.next()?;
+
+    // Build the texture ready to receive the data.
+    let (width, height) = first_buffer.dimensions();
+    let extent = wgpu::Extent3d {
+        width,
+        height,
+        depth_or_array_layers: array_layers,
     };
-    let mut encoder = device.create_command_encoder(&cmd_encoder_desc);
-    let texture =
-        encode_load_texture_array_from_image_buffers(device, &mut encoder, usage, buffers);
-    queue.submit(std::iter::once(encoder.finish()));
-    texture
+    let texture = wgpu::TextureBuilder::from_image_view(first_buffer)
+        .extent(extent)
+        .dimension(wgpu::TextureDimension::D2) // force an array
+        .usage(wgpu::TextureBuilder::REQUIRED_IMAGE_TEXTURE_USAGE | usage)
+        .build(device);
+
+    // Describe the layout of the data.
+    let format = texture.format();
+    let block_size = format.describe().block_size;
+    let bytes_per_row = extent.width * block_size as u32;
+    let padding = wgpu::compute_row_padding(bytes_per_row);
+    let padded_bytes_per_row = bytes_per_row + padding;
+    let image_data_layout = wgpu::ImageDataLayout {
+        offset: 0,
+        bytes_per_row: std::num::NonZeroU32::new(padded_bytes_per_row),
+        rows_per_image: std::num::NonZeroU32::new(height),
+    };
+
+    // Collect the data into a single slice, padding each row as necessary.
+    //
+    // NOTE: Previously we used `encode_load_texture_array_from_image_buffers` which avoids
+    // collecting the image data into a single slice. However, the `wgpu::Texture::from_*`
+    // constructors have been changed to avoid submitting an extra command buffer in favour
+    // of using `Queue::write_texture` which schedules the write for the next call to
+    // `Queue::submit`. This is to avoid an Intel driver bug where submitting more than one command
+    // buffer per frame appears to be causing issues:
+    // https://github.com/gfx-rs/wgpu/issues/1672#issuecomment-917510810
+    //
+    // While this likely means consuming more RAM, it also likely results in slightly better
+    // performance due to reducing the number of command buffers submitted.
+    //
+    // Users can still use `encode_load_texture_array_from_image_buffers` directly if they wish.
+    let capacity = bytes_per_row as usize * height as usize * array_layers as usize;
+    let mut data: Vec<u8> = Vec::with_capacity(capacity);
+    for buffer in Some(first_buffer).into_iter().chain(buffers) {
+        let layer_data = unsafe { wgpu::bytes::from_slice(&*buffer) };
+        for row_data in layer_data.chunks(bytes_per_row as usize) {
+            data.extend_from_slice(row_data);
+            data.extend((0..padding).map(|_| 0u8));
+        }
+    }
+
+    // Copy into the entire texture.
+    let image_copy_texture = wgpu::ImageCopyTexture {
+        texture: texture.inner(),
+        mip_level: 0,
+        origin: wgpu::Origin3d::ZERO,
+        // TODO: Maybe we shouldn't assume this?
+        aspect: wgpu::TextureAspect::All,
+    };
+
+    queue.write_texture(image_copy_texture, &data, image_data_layout, extent);
+
+    Some(texture)
 }
 
 /// Encode the necessary commands to load a texture directly from a dynamic image.
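The single-layer path above passes an unpadded `bytes_per_row` straight to `Queue::write_texture`, which, unlike buffer-to-texture copies recorded on a command encoder, does not require 256-byte row alignment. A stand-alone sketch of the same pattern against the upstream `wgpu` 0.10 API follows; the helper name, parameters and the tightly packed RGBA8 assumption are illustrative, not part of this patch.

```rust
use std::num::NonZeroU32;

// Upload tightly packed RGBA8 pixels into an existing single-layer 2D texture.
fn write_rgba8(
    queue: &wgpu::Queue,
    texture: &wgpu::Texture,
    width: u32,
    height: u32,
    pixels: &[u8],
) {
    let layout = wgpu::ImageDataLayout {
        offset: 0,
        // `Queue::write_texture` accepts the unpadded row size directly.
        bytes_per_row: NonZeroU32::new(4 * width),
        // Only required when copying more than one layer or depth slice.
        rows_per_image: None,
    };
    let copy = wgpu::ImageCopyTexture {
        texture,
        mip_level: 0,
        origin: wgpu::Origin3d::ZERO,
        aspect: wgpu::TextureAspect::All,
    };
    let extent = wgpu::Extent3d {
        width,
        height,
        depth_or_array_layers: 1,
    };
    // The write is only scheduled here; it takes effect at the next `Queue::submit`.
    queue.write_texture(copy, pixels, layout, extent);
}
```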
diff --git a/nannou_wgpu/src/texture/row_padded_buffer.rs b/nannou_wgpu/src/texture/row_padded_buffer.rs
index 7233794ce..e76764783 100644
--- a/nannou_wgpu/src/texture/row_padded_buffer.rs
+++ b/nannou_wgpu/src/texture/row_padded_buffer.rs
@@ -30,7 +30,7 @@ impl RowPaddedBuffer {
     ///
     /// Width should be given in bytes.
     pub fn new(device: &wgpu::Device, width: u32, height: u32, usage: wgpu::BufferUsages) -> Self {
-        let row_padding = Self::compute_row_padding(width);
+        let row_padding = wgpu::compute_row_padding(width);
 
         // only create mapped for buffers that we're going to write to.
         let mapped_at_creation = usage.contains(wgpu::BufferUsages::MAP_WRITE);
@@ -66,11 +66,6 @@ impl RowPaddedBuffer {
         )
     }
 
-    /// Compute the necessary padding for each row.
-    fn compute_row_padding(width: u32) -> u32 {
-        wgpu::COPY_BYTES_PER_ROW_ALIGNMENT - (width % wgpu::COPY_BYTES_PER_ROW_ALIGNMENT)
-    }
-
     /// The width of the buffer, in bytes, NOT including padding bytes.
     pub fn width(&self) -> u32 {
         self.width
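Finally, a worked example of the row padding arithmetic now shared between `RowPaddedBuffer::new` and the texture array constructor. The numbers are illustrative, and the example assumes `nannou_wgpu` re-exports the upstream `COPY_BYTES_PER_ROW_ALIGNMENT` constant (256 in current `wgpu`), as the code above suggests.

```rust
#[test]
fn row_padding_example() {
    // A 300-texel-wide RGBA8 row occupies 300 * 4 = 1200 bytes.
    let bytes_per_row = 300u32 * 4;
    // 256 - (1200 % 256) = 256 - 176 = 80 bytes of padding.
    let padding = nannou_wgpu::compute_row_padding(bytes_per_row);
    assert_eq!(padding, 80);
    // Padded rows are 1280 bytes, i.e. a multiple of the copy alignment. Note that an
    // already-aligned row gains a full extra 256 bytes with this formula.
    let padded = bytes_per_row + padding;
    assert_eq!(padded % nannou_wgpu::COPY_BYTES_PER_ROW_ALIGNMENT, 0);
}
```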