From d2809137bab1e40a3fe607505367183e5ed33f2a Mon Sep 17 00:00:00 2001 From: Teodor Tanasoaia <28601907+teoxoy@users.noreply.github.com> Date: Thu, 26 Jan 2023 00:28:35 +0100 Subject: [PATCH] Zero-initialize workgroup memory (#3174) fixes https://github.com/gfx-rs/wgpu/issues/2430 --- CHANGELOG.md | 3 +- Cargo.lock | 2 +- Cargo.toml | 2 +- wgpu-core/Cargo.toml | 2 +- wgpu-hal/Cargo.toml | 4 +- wgpu-hal/src/dx12/device.rs | 1 + wgpu-hal/src/gles/device.rs | 1 + wgpu-hal/src/metal/device.rs | 1 + wgpu-hal/src/vulkan/adapter.rs | 41 ++++ wgpu-hal/src/vulkan/mod.rs | 1 + wgpu/tests/shader/mod.rs | 1 + wgpu/tests/shader/zero_init_workgroup_mem.rs | 183 ++++++++++++++++++ .../tests/shader/zero_init_workgroup_mem.wgsl | 31 +++ 13 files changed, 267 insertions(+), 6 deletions(-) create mode 100644 wgpu/tests/shader/zero_init_workgroup_mem.rs create mode 100644 wgpu/tests/shader/zero_init_workgroup_mem.wgsl diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ba5a115aa..0cedadee06 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -225,7 +225,8 @@ surface.configure(&device, &config); - Implemented correleation between user timestamps and platform specific presentation timestamps via [`Adapter::get_presentation_timestamp`]. By @cwfitzgerald in [#3240](https://github.com/gfx-rs/wgpu/pull/3240) - Added support for `Features::SHADER_PRIMITIVE_INDEX` on all backends. By @cwfitzgerald in [#3272](https://github.com/gfx-rs/wgpu/pull/3272) - Implemented `TextureFormat::Stencil8`, allowing for stencil testing without depth components. By @Dinnerbone in [#3343](https://github.com/gfx-rs/wgpu/pull/3343) -- Implemented `add_srgb_suffix()` for `TextureFormat` for converting linear formats to sRGB. By @Elabajaba in [#3419](https://github.com/gfx-rs/wgpu/pull/3419) +- Implemented `add_srgb_suffix()` for `TextureFormat` for converting linear formats to sRGB. By @Elabajaba in [#3419](https://github.com/gfx-rs/wgpu/pull/3419) +- Zero-initialize workgroup memory. By @teoxoy in [#3174](https://github.com/gfx-rs/wgpu/pull/3174) #### GLES diff --git a/Cargo.lock b/Cargo.lock index 32fe039822..e9cb6f9679 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1471,7 +1471,7 @@ dependencies = [ [[package]] name = "naga" version = "0.10.0" -source = "git+https://github.com/gfx-rs/naga?rev=1be8024#1be8024bda3594987b417bead5024b98be9ab521" +source = "git+https://github.com/gfx-rs/naga?rev=c7d02151f08d6285683795289b5725b827d836d1#c7d02151f08d6285683795289b5725b827d836d1" dependencies = [ "bit-set", "bitflags", diff --git a/Cargo.toml b/Cargo.toml index 9ede2a7826..9f97c0a640 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,7 +39,7 @@ path = "./wgpu-hal" [workspace.dependencies.naga] git = "https://github.com/gfx-rs/naga" -rev = "1be8024" +rev = "c7d02151f08d6285683795289b5725b827d836d1" version = "0.10" [workspace.dependencies] diff --git a/wgpu-core/Cargo.toml b/wgpu-core/Cargo.toml index 3fd13f2dd6..e4a128a2b4 100644 --- a/wgpu-core/Cargo.toml +++ b/wgpu-core/Cargo.toml @@ -67,7 +67,7 @@ thiserror = "1" [dependencies.naga] git = "https://github.com/gfx-rs/naga" -rev = "1be8024" +rev = "c7d02151f08d6285683795289b5725b827d836d1" version = "0.10" features = ["clone", "span", "validate"] diff --git a/wgpu-hal/Cargo.toml b/wgpu-hal/Cargo.toml index 45fbe29fc5..7589a807b9 100644 --- a/wgpu-hal/Cargo.toml +++ b/wgpu-hal/Cargo.toml @@ -113,14 +113,14 @@ android_system_properties = "0.1.1" [dependencies.naga] git = "https://github.com/gfx-rs/naga" -rev = "1be8024" +rev = "c7d02151f08d6285683795289b5725b827d836d1" version = "0.10" features = ["clone"] # DEV dependencies [dev-dependencies.naga] git = "https://github.com/gfx-rs/naga" -rev = "1be8024" +rev = "c7d02151f08d6285683795289b5725b827d836d1" version = "0.10" features = ["wgsl-in"] diff --git a/wgpu-hal/src/dx12/device.rs b/wgpu-hal/src/dx12/device.rs index fb2c3f92fc..213205fd70 100644 --- a/wgpu-hal/src/dx12/device.rs +++ b/wgpu-hal/src/dx12/device.rs @@ -1070,6 +1070,7 @@ impl crate::Device for super::Device { fake_missing_bindings: false, special_constants_binding, push_constants_target, + zero_initialize_workgroup_memory: true, }, }) } diff --git a/wgpu-hal/src/gles/device.rs b/wgpu-hal/src/gles/device.rs index ed8717df97..767a553769 100644 --- a/wgpu-hal/src/gles/device.rs +++ b/wgpu-hal/src/gles/device.rs @@ -1032,6 +1032,7 @@ impl crate::Device for super::Device { version: self.shared.shading_language_version, writer_flags, binding_map, + zero_initialize_workgroup_memory: true, }, }) } diff --git a/wgpu-hal/src/metal/device.rs b/wgpu-hal/src/metal/device.rs index 2a994bb579..8ba2702ee4 100644 --- a/wgpu-hal/src/metal/device.rs +++ b/wgpu-hal/src/metal/device.rs @@ -699,6 +699,7 @@ impl crate::Device for super::Device { // TODO: support bounds checks on binding arrays binding_array: naga::proc::BoundsCheckPolicy::Unchecked, }, + zero_initialize_workgroup_memory: true, }, total_push_constants, }) diff --git a/wgpu-hal/src/vulkan/adapter.rs b/wgpu-hal/src/vulkan/adapter.rs index 8423f74ebb..c86dbb9173 100644 --- a/wgpu-hal/src/vulkan/adapter.rs +++ b/wgpu-hal/src/vulkan/adapter.rs @@ -31,6 +31,8 @@ pub struct PhysicalDeviceFeatures { vk::PhysicalDeviceShaderFloat16Int8Features, vk::PhysicalDevice16BitStorageFeatures, )>, + zero_initialize_workgroup_memory: + Option, } // This is safe because the structs have `p_next: *mut c_void`, which we null out/never read. @@ -69,6 +71,9 @@ impl PhysicalDeviceFeatures { info = info.push_next(f16_i8_feature); info = info.push_next(_16bit_feature); } + if let Some(ref mut feature) = self.zero_initialize_workgroup_memory { + info = info.push_next(feature); + } info } @@ -286,6 +291,19 @@ impl PhysicalDeviceFeatures { } else { None }, + zero_initialize_workgroup_memory: if effective_api_version >= vk::API_VERSION_1_3 + || enabled_extensions.contains(&vk::KhrZeroInitializeWorkgroupMemoryFn::name()) + { + Some( + vk::PhysicalDeviceZeroInitializeWorkgroupMemoryFeatures::builder() + .shader_zero_initialize_workgroup_memory( + private_caps.zero_initialize_workgroup_memory, + ) + .build(), + ) + } else { + None + }, } } @@ -885,6 +903,16 @@ impl super::InstanceShared { builder = builder.push_next(&mut next.1); } + // `VK_KHR_zero_initialize_workgroup_memory` is promoted to 1.3 + if capabilities.effective_api_version >= vk::API_VERSION_1_3 + || capabilities.supports_extension(vk::KhrZeroInitializeWorkgroupMemoryFn::name()) + { + let next = features + .zero_initialize_workgroup_memory + .insert(vk::PhysicalDeviceZeroInitializeWorkgroupMemoryFeatures::default()); + builder = builder.push_next(next); + } + let mut features2 = builder.build(); unsafe { get_device_properties.get_physical_device_features2(phd, &mut features2); @@ -1044,6 +1072,11 @@ impl super::Instance { .image_robustness .map_or(false, |ext| ext.robust_image_access != 0), }, + zero_initialize_workgroup_memory: phd_features + .zero_initialize_workgroup_memory + .map_or(false, |ext| { + ext.shader_zero_initialize_workgroup_memory == vk::TRUE + }), }; let capabilities = crate::Capabilities { limits: phd_capabilities.to_wgpu_limits(), @@ -1246,6 +1279,14 @@ impl super::Adapter { // TODO: support bounds checks on binding arrays binding_array: naga::proc::BoundsCheckPolicy::Unchecked, }, + zero_initialize_workgroup_memory: if self + .private_caps + .zero_initialize_workgroup_memory + { + spv::ZeroInitializeWorkgroupMemoryMode::Native + } else { + spv::ZeroInitializeWorkgroupMemoryMode::Polyfill + }, // We need to build this separately for each invocation, so just default it out here binding_map: BTreeMap::default(), } diff --git a/wgpu-hal/src/vulkan/mod.rs b/wgpu-hal/src/vulkan/mod.rs index 0eadb17499..3c2ebbb5e9 100644 --- a/wgpu-hal/src/vulkan/mod.rs +++ b/wgpu-hal/src/vulkan/mod.rs @@ -166,6 +166,7 @@ struct PrivateCapabilities { non_coherent_map_mask: wgt::BufferAddress, robust_buffer_access: bool, robust_image_access: bool, + zero_initialize_workgroup_memory: bool, } bitflags::bitflags!( diff --git a/wgpu/tests/shader/mod.rs b/wgpu/tests/shader/mod.rs index c1cca4e4d7..518b7f940b 100644 --- a/wgpu/tests/shader/mod.rs +++ b/wgpu/tests/shader/mod.rs @@ -17,6 +17,7 @@ use crate::common::TestingContext; mod numeric_builtins; mod struct_layout; +mod zero_init_workgroup_mem; #[derive(Clone, Copy, PartialEq)] enum InputStorageType { diff --git a/wgpu/tests/shader/zero_init_workgroup_mem.rs b/wgpu/tests/shader/zero_init_workgroup_mem.rs new file mode 100644 index 0000000000..da87ae1a18 --- /dev/null +++ b/wgpu/tests/shader/zero_init_workgroup_mem.rs @@ -0,0 +1,183 @@ +use std::num::NonZeroU64; + +use wgpu::{ + include_wgsl, Backends, BindGroupDescriptor, BindGroupEntry, BindGroupLayoutDescriptor, + BindGroupLayoutEntry, BindingResource, BindingType, BufferBinding, BufferBindingType, + BufferDescriptor, BufferUsages, CommandEncoderDescriptor, ComputePassDescriptor, + ComputePipelineDescriptor, DownlevelFlags, Limits, Maintain, MapMode, PipelineLayoutDescriptor, + ShaderStages, +}; + +use crate::common::{initialize_test, TestParameters, TestingContext}; + +#[test] +fn zero_init_workgroup_mem() { + initialize_test( + TestParameters::default() + .downlevel_flags(DownlevelFlags::COMPUTE_SHADERS) + .limits(Limits::downlevel_defaults()) + // remove once we get to https://github.com/gfx-rs/wgpu/issues/3193 or + // https://github.com/gfx-rs/wgpu/issues/3160 + .specific_failure( + Some(Backends::DX12), + Some(5140), + Some("Microsoft Basic Render Driver"), + true, + ) + // this one is flakey + .specific_failure( + Some(Backends::VULKAN), + Some(6880), + Some("SwiftShader"), + true, + ) + // TODO: investigate why it fails + .specific_failure(Some(Backends::GL), Some(65541), Some("llvmpipe"), false), + zero_init_workgroup_mem_impl, + ); +} + +const DISPATCH_SIZE: (u32, u32, u32) = (64, 64, 64); +const TOTAL_WORK_GROUPS: u32 = DISPATCH_SIZE.0 * DISPATCH_SIZE.1 * DISPATCH_SIZE.2; + +/// nr of bytes we use in the shader +const SHADER_WORKGROUP_MEMORY: u32 = 512 * 4 + 4; +// assume we have this much workgroup memory (2GB) +const MAX_DEVICE_WORKGROUP_MEMORY: u32 = i32::MAX as u32; +const NR_OF_DISPATCHES: u32 = + MAX_DEVICE_WORKGROUP_MEMORY / (SHADER_WORKGROUP_MEMORY * TOTAL_WORK_GROUPS) + 1; // TODO: use div_ceil once stabilized + +const OUTPUT_ARRAY_SIZE: u32 = TOTAL_WORK_GROUPS * NR_OF_DISPATCHES; +const BUFFER_SIZE: u64 = OUTPUT_ARRAY_SIZE as u64 * 4; +const BUFFER_BINDING_SIZE: u32 = TOTAL_WORK_GROUPS * 4; + +fn zero_init_workgroup_mem_impl(ctx: TestingContext) { + let bgl = ctx + .device + .create_bind_group_layout(&BindGroupLayoutDescriptor { + label: None, + entries: &[BindGroupLayoutEntry { + binding: 0, + visibility: ShaderStages::COMPUTE, + ty: BindingType::Buffer { + ty: BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: true, + min_binding_size: None, + }, + count: None, + }], + }); + + let output_buffer = ctx.device.create_buffer(&BufferDescriptor { + label: Some("output buffer"), + size: BUFFER_SIZE, + usage: BufferUsages::COPY_DST | BufferUsages::COPY_SRC | BufferUsages::STORAGE, + mapped_at_creation: false, + }); + + let mapping_buffer = ctx.device.create_buffer(&BufferDescriptor { + label: Some("mapping buffer"), + size: BUFFER_SIZE, + usage: BufferUsages::COPY_DST | BufferUsages::MAP_READ, + mapped_at_creation: false, + }); + + let bg = ctx.device.create_bind_group(&BindGroupDescriptor { + label: None, + layout: &bgl, + entries: &[BindGroupEntry { + binding: 0, + resource: BindingResource::Buffer(BufferBinding { + buffer: &output_buffer, + offset: 0, + size: Some(NonZeroU64::new(BUFFER_BINDING_SIZE as u64).unwrap()), + }), + }], + }); + + let pll = ctx + .device + .create_pipeline_layout(&PipelineLayoutDescriptor { + label: None, + bind_group_layouts: &[&bgl], + push_constant_ranges: &[], + }); + + let sm = ctx + .device + .create_shader_module(include_wgsl!("zero_init_workgroup_mem.wgsl")); + + let pipeline_read = ctx + .device + .create_compute_pipeline(&ComputePipelineDescriptor { + label: Some("pipeline read"), + layout: Some(&pll), + module: &sm, + entry_point: "read", + }); + + let pipeline_write = ctx + .device + .create_compute_pipeline(&ComputePipelineDescriptor { + label: Some("pipeline write"), + layout: None, + module: &sm, + entry_point: "write", + }); + + // -- Initializing data -- + + let output_pre_init_data = vec![1; OUTPUT_ARRAY_SIZE as usize]; + ctx.queue.write_buffer( + &output_buffer, + 0, + bytemuck::cast_slice(&output_pre_init_data), + ); + + // -- Run test -- + + let mut encoder = ctx + .device + .create_command_encoder(&CommandEncoderDescriptor::default()); + + let mut cpass = encoder.begin_compute_pass(&ComputePassDescriptor::default()); + + cpass.set_pipeline(&pipeline_write); + for _ in 0..NR_OF_DISPATCHES { + cpass.dispatch_workgroups(DISPATCH_SIZE.0, DISPATCH_SIZE.1, DISPATCH_SIZE.2); + } + + cpass.set_pipeline(&pipeline_read); + for i in 0..NR_OF_DISPATCHES { + cpass.set_bind_group(0, &bg, &[i * BUFFER_BINDING_SIZE]); + cpass.dispatch_workgroups(DISPATCH_SIZE.0, DISPATCH_SIZE.1, DISPATCH_SIZE.2); + } + drop(cpass); + + // -- Pulldown data -- + + encoder.copy_buffer_to_buffer(&output_buffer, 0, &mapping_buffer, 0, BUFFER_SIZE); + + ctx.queue.submit(Some(encoder.finish())); + + mapping_buffer.slice(..).map_async(MapMode::Read, |_| ()); + ctx.device.poll(Maintain::Wait); + + let mapped = mapping_buffer.slice(..).get_mapped_range(); + + let typed: &[u32] = bytemuck::cast_slice(&*mapped); + + // -- Check results -- + + let num_disptaches_failed = typed.iter().filter(|&&res| res != 0).count(); + let ratio = (num_disptaches_failed as f32 / OUTPUT_ARRAY_SIZE as f32) * 100.; + + assert!( + num_disptaches_failed == 0, + "Zero-initialization of workgroup memory failed ({:.0}% of disptaches failed).", + ratio + ); + + drop(mapped); + mapping_buffer.unmap(); +} diff --git a/wgpu/tests/shader/zero_init_workgroup_mem.wgsl b/wgpu/tests/shader/zero_init_workgroup_mem.wgsl new file mode 100644 index 0000000000..638b89edab --- /dev/null +++ b/wgpu/tests/shader/zero_init_workgroup_mem.wgsl @@ -0,0 +1,31 @@ +const array_size = 512u; + +struct WStruct { + arr: array, + atom: atomic +} + +var w_mem: WStruct; + +@group(0) @binding(0) +var output: array; + +@compute @workgroup_size(1) +fn read(@builtin(workgroup_id) wgid: vec3, @builtin(num_workgroups) num_workgroups: vec3) { + var is_zero = true; + for(var i = 0u; i < array_size; i++) { + is_zero &= w_mem.arr[i] == 0u; + } + is_zero &= atomicLoad(&w_mem.atom) == 0u; + + let idx = wgid.x + (wgid.y * num_workgroups.x) + (wgid.z * num_workgroups.x * num_workgroups.y); + output[idx] = u32(!is_zero); +} + +@compute @workgroup_size(1) +fn write() { + for(var i = 0u; i < array_size; i++) { + w_mem.arr[i] = i; + } + atomicStore(&w_mem.atom, 3u); +}