diff --git a/README.md b/README.md index 5be6a8ad..52781d07 100644 --- a/README.md +++ b/README.md @@ -127,6 +127,13 @@ If an application fails to start under ZLUDA or crashes please check [Known Issu ### Applications +#### Meshroom + +Meshroom works only on Windows due to an underlying ROCm/HIP issue. + +Meshroom 2023.3.0 might not work; it's recommended to use Meshroom freshly built from the develop branch. See #79 and alicevision/Meshroom#595. Please open an issue here if you run into problems. + + #### llama.cpp If you are building llama.cpp with cmake and don't want it to crash on ZLUDA then you should use `CUDA_DOCKER_ARCH=compute_61` like this: diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc index fec881a6..f522728c 100644 Binary files a/ptx/lib/zluda_ptx_impl.bc and b/ptx/lib/zluda_ptx_impl.bc differ diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp index bea4202d..e5648105 100644 --- a/ptx/lib/zluda_ptx_impl.cpp +++ b/ptx/lib/zluda_ptx_impl.cpp @@ -232,23 +232,23 @@ static __device__ void image_store_pck(T value, typename Coordinates::type uint value_dword = transmute(value); if constexpr (geo == ImageGeometry::_1D) { - asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:1D unorm" : : "v"(value_dword), "v"(coord.x), "s"(*surface) : "memory"); + asm("image_store_pck %0, %1, %2 dmask:0x1 dim:1D unorm" : : "v"(value_dword), "v"(coord.x), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::_2D) { - asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:2D unorm" : : "v"(value_dword), "v"(coord), "s"(*surface) : "memory"); + asm("image_store_pck %0, %1, %2 dmask:0x1 dim:2D unorm" : : "v"(value_dword), "v"(coord), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::_3D) { - asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:3D unorm" : : "v"(value_dword), "v"(transmute(coord)), "s"(*surface) : "memory"); + asm("image_store_pck %0, %1, %2 dmask:0x1 dim:3D unorm" : : "v"(value_dword), "v"(transmute(coord)), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::A1D) { - asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:1D_ARRAY unorm" : : "v"(value_dword), "v"(coord), "s"(*surface) : "memory"); + asm("image_store_pck %0, %1, %2 dmask:0x1 dim:1D_ARRAY unorm" : : "v"(value_dword), "v"(coord), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::A2D) { - asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:2D_ARRAY unorm" : : "v"(value_dword), "v"(coord), "s"(*surface) : "memory"); + asm("image_store_pck %0, %1, %2 dmask:0x1 dim:2D_ARRAY unorm" : : "v"(value_dword), "v"(coord), "s"(*surface) : "memory"); } else { @@ -260,23 +260,23 @@ static __device__ void image_store_pck(T value, typename Coordinates::type uint2::Native_vec_ value_dword2 = transmute(value); if constexpr (geo == ImageGeometry::_1D) { - asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:1D unorm" : : "v"(value_dword2), "v"(coord.x), "s"(*surface) : "memory"); + asm("image_store_pck %0, %1, %2 dmask:0x3 dim:1D unorm" : : "v"(value_dword2), "v"(coord.x), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::_2D) { - asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:2D unorm" : : "v"(value_dword2), "v"(coord), "s"(*surface) : "memory"); + asm("image_store_pck %0, %1, %2 dmask:0x3 dim:2D unorm" : : "v"(value_dword2), "v"(coord), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::_3D) { - asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:3D unorm" : : 
"v"(value_dword2), "v"(transmute(coord)), "s"(*surface) : "memory"); + asm("image_store_pck %0, %1, %2 dmask:0x3 dim:3D unorm" : : "v"(value_dword2), "v"(transmute(coord)), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::A1D) { - asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:1D_ARRAY unorm" : : "v"(value_dword2), "v"(coord), "s"(*surface) : "memory"); + asm("image_store_pck %0, %1, %2 dmask:0x3 dim:1D_ARRAY unorm" : : "v"(value_dword2), "v"(coord), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::A2D) { - asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:2D_ARRAY unorm" : : "v"(value_dword2), "v"(coord), "s"(*surface) : "memory"); + asm("image_store_pck %0, %1, %2 dmask:0x3 dim:2D_ARRAY unorm" : : "v"(value_dword2), "v"(coord), "s"(*surface) : "memory"); } else { @@ -288,23 +288,23 @@ static __device__ void image_store_pck(T value, typename Coordinates::type uint4::Native_vec_ value_dword4 = transmute(value); if constexpr (geo == ImageGeometry::_1D) { - asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:1D unorm" : : "v"(value_dword4), "v"(coord.x), "s"(*surface) : "memory"); + asm("image_store_pck %0, %1, %2 dmask:0xf dim:1D unorm" : : "v"(value_dword4), "v"(coord.x), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::_2D) { - asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:2D unorm" : : "v"(value_dword4), "v"(coord), "s"(*surface) : "memory"); + asm("image_store_pck %0, %1, %2 dmask:0xf dim:2D unorm" : : "v"(value_dword4), "v"(coord), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::_3D) { - asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:3D unorm" : : "v"(value_dword4), "v"(transmute(coord)), "s"(*surface) : "memory"); + asm("image_store_pck %0, %1, %2 dmask:0xf dim:3D unorm" : : "v"(value_dword4), "v"(transmute(coord)), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::A1D) { - asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:1D_ARRAY unorm" : : "v"(value_dword4), "v"(coord), "s"(*surface) : "memory"); + asm("image_store_pck %0, %1, %2 dmask:0xf dim:1D_ARRAY unorm" : : "v"(value_dword4), "v"(coord), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::A2D) { - asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:2D_ARRAY unorm" : : "v"(value_dword4), "v"(coord), "s"(*surface) : "memory"); + asm("image_store_pck %0, %1, %2 dmask:0xf dim:2D_ARRAY unorm" : : "v"(value_dword4), "v"(coord), "s"(*surface) : "memory"); } else { @@ -325,23 +325,23 @@ static __device__ T image_load_pck(typename Coordinates::type coord, surfac uint data; if constexpr (geo == ImageGeometry::_1D) { - asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory"); + asm("image_load_pck %0, %1, %2 dmask:0x1 dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::_2D) { - asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + asm("image_load_pck %0, %1, %2 dmask:0x1 dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::_3D) { - asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:3D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(transmute(coord)), "s"(*surface) : "memory"); + asm("image_load_pck %0, %1, %2 dmask:0x1 dim:3D unorm\ns_waitcnt vmcnt(0)" : 
"=v"(data) : "v"(transmute(coord)), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::A1D) { - asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + asm("image_load_pck %0, %1, %2 dmask:0x1 dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::A2D) { - asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + asm("image_load_pck %0, %1, %2 dmask:0x1 dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); } else { @@ -354,23 +354,23 @@ static __device__ T image_load_pck(typename Coordinates::type coord, surfac uint2::Native_vec_ data; if constexpr (geo == ImageGeometry::_1D) { - asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory"); + asm("image_load_pck %0, %1, %2 dmask:0x3 dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::_2D) { - asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + asm("image_load_pck %0, %1, %2 dmask:0x3 dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::_3D) { - asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:3D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(transmute(coord)), "s"(*surface) : "memory"); + asm("image_load_pck %0, %1, %2 dmask:0x3 dim:3D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(transmute(coord)), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::A1D) { - asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + asm("image_load_pck %0, %1, %2 dmask:0x3 dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::A2D) { - asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + asm("image_load_pck %0, %1, %2 dmask:0x3 dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); } else { @@ -383,23 +383,23 @@ static __device__ T image_load_pck(typename Coordinates::type coord, surfac uint4::Native_vec_ data; if constexpr (geo == ImageGeometry::_1D) { - asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory"); + asm("image_load_pck %0, %1, %2 dmask:0xf dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::_2D) { - asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + asm("image_load_pck %0, %1, %2 dmask:0xf dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::_3D) { - asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:3D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(transmute(coord)), "s"(*surface) : "memory"); + asm("image_load_pck %0, %1, %2 dmask:0xf dim:3D unorm\ns_waitcnt 
vmcnt(0)" : "=v"(data) : "v"(transmute(coord)), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::A1D) { - asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + asm("image_load_pck %0, %1, %2 dmask:0xf dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::A2D) { - asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + asm("image_load_pck %0, %1, %2 dmask:0xf dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); } else { @@ -419,23 +419,23 @@ static __device__ uint4::Native_vec_ image_load_pck_full(typename Coordinates(coord)), "s"(*surface) : "memory"); + asm("image_load_pck %0, %1, %2 dmask:0xf dim:3D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(transmute(coord)), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::A1D) { - asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + asm("image_load_pck %0, %1, %2 dmask:0xf dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); } else if constexpr (geo == ImageGeometry::A2D) { - asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + asm("image_load_pck %0, %1, %2 dmask:0xf dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); } else { @@ -715,16 +715,26 @@ extern "C" tex_1d_f16(s32, int, tex1Dfetch_f16); tex_1d_f16(f32, float, tex1D_f16); -#define tex_2d(CHANNEL_TYPE, HIP_CHANNEL_TYPE, COORD_TYPE, HIP_COORD_TYPE) \ - HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_2d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(struct textureReference GLOBAL_SPACE * ptr, HIP_COORD_TYPE##2 ::Native_vec_ coord) \ - { \ - hipTextureObject_t textureObject = ptr->textureObject; \ - return tex2D(textureObject, float(coord.x), float(coord.y)).data; \ - } \ - HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_indirect_2d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(uint64_t texobj, HIP_COORD_TYPE##2 ::Native_vec_ coord) \ - { \ - hipTextureObject_t textureObject = (hipTextureObject_t)texobj; \ - return tex2D(textureObject, float(coord.x), float(coord.y)).data; \ +#define tex_2d(CHANNEL_TYPE, HIP_CHANNEL_TYPE, COORD_TYPE, HIP_COORD_TYPE) \ + HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_2d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(struct textureReference GLOBAL_SPACE * ptr, HIP_COORD_TYPE##2 ::Native_vec_ coord) \ + { \ + hipTextureObject_t textureObject = ptr->textureObject; \ + return tex2D(textureObject, float(coord.x), float(coord.y)).data; \ + } \ + HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_level_2d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(struct textureReference GLOBAL_SPACE * ptr, HIP_COORD_TYPE##2 ::Native_vec_ coord, HIP_COORD_TYPE lod) \ + { \ + hipTextureObject_t textureObject = ptr->textureObject; \ + return tex2DLod(textureObject, float(coord.x), float(coord.y), float(lod)).data; \ + } \ + HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_indirect_2d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(uint64_t texobj, HIP_COORD_TYPE##2 ::Native_vec_ coord) \ + { \ + hipTextureObject_t textureObject = (hipTextureObject_t)texobj; \ + return tex2D(textureObject, float(coord.x), float(coord.y)).data; \ + } \ + HIP_CHANNEL_TYPE##4 
::Native_vec_ FUNC(tex_level_indirect_2d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(uint64_t texobj, HIP_COORD_TYPE##2 ::Native_vec_ coord, HIP_COORD_TYPE lod) \ + { \ + hipTextureObject_t textureObject = (hipTextureObject_t)texobj; \ + return tex2DLod(textureObject, float(coord.x), float(coord.y), float(lod)).data; \ } __device__ half4 __ockl_image_sampleh_2D(unsigned int CONSTANT_SPACE *i, unsigned int ADDRESS_SPACE_CONSTANT *s, float2::Native_vec_ c); @@ -751,16 +761,26 @@ extern "C" tex_2d_f16(s32, int); tex_2d_f16(f32, float); -#define tex_3d(CHANNEL_TYPE, HIP_CHANNEL_TYPE, COORD_TYPE, HIP_COORD_TYPE) \ - HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_3d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(struct textureReference GLOBAL_SPACE * ptr, HIP_COORD_TYPE##4 ::Native_vec_ coord) \ - { \ - hipTextureObject_t textureObject = ptr->textureObject; \ - return tex3D(textureObject, float(coord.x), float(coord.y), float(coord.z)).data; \ - } \ - HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_indirect_3d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(uint64_t texobj, HIP_COORD_TYPE##4 ::Native_vec_ coord) \ - { \ - hipTextureObject_t textureObject = (hipTextureObject_t)texobj; \ - return tex3D(textureObject, float(coord.x), float(coord.y), float(coord.z)).data; \ +#define tex_3d(CHANNEL_TYPE, HIP_CHANNEL_TYPE, COORD_TYPE, HIP_COORD_TYPE) \ + HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_3d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(struct textureReference GLOBAL_SPACE * ptr, HIP_COORD_TYPE##4 ::Native_vec_ coord) \ + { \ + hipTextureObject_t textureObject = ptr->textureObject; \ + return tex3D(textureObject, float(coord.x), float(coord.y), float(coord.z)).data; \ + } \ + HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_level_3d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(struct textureReference GLOBAL_SPACE * ptr, HIP_COORD_TYPE##4 ::Native_vec_ coord, HIP_COORD_TYPE lod) \ + { \ + hipTextureObject_t textureObject = ptr->textureObject; \ + return tex3DLod(textureObject, float(coord.x), float(coord.y), float(coord.z), float(lod)).data; \ + } \ + HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_indirect_3d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(uint64_t texobj, HIP_COORD_TYPE##4 ::Native_vec_ coord) \ + { \ + hipTextureObject_t textureObject = (hipTextureObject_t)texobj; \ + return tex3D(textureObject, float(coord.x), float(coord.y), float(coord.z)).data; \ + } \ + HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_level_indirect_3d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(uint64_t texobj, HIP_COORD_TYPE##4 ::Native_vec_ coord, HIP_COORD_TYPE lod) \ + { \ + hipTextureObject_t textureObject = (hipTextureObject_t)texobj; \ + return tex3DLod(textureObject, float(coord.x), float(coord.y), float(coord.z), float(lod)).data; \ } __device__ half4 __ockl_image_sampleh_3D(unsigned int CONSTANT_SPACE *i, unsigned int ADDRESS_SPACE_CONSTANT *s, float4::Native_vec_ c); @@ -787,16 +807,26 @@ extern "C" tex_3d_f16(s32, int); tex_3d_f16(f32, float); -#define tex_a1d(CHANNEL_TYPE, HIP_CHANNEL_TYPE, COORD_TYPE, HIP_COORD_TYPE) \ - HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_a1d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(struct textureReference GLOBAL_SPACE * ptr, uint32_t layer, HIP_COORD_TYPE x) \ - { \ - hipTextureObject_t textureObject = ptr->textureObject; \ - return tex1DLayered(textureObject, float(x), int(layer)).data; \ - } \ - HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_indirect_a1d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(uint64_t texobj, uint32_t layer, HIP_COORD_TYPE x) \ - { \ - hipTextureObject_t textureObject = (hipTextureObject_t)texobj; \ - return tex1DLayered(textureObject, float(x), int(layer)).data; \ 
+#define tex_a1d(CHANNEL_TYPE, HIP_CHANNEL_TYPE, COORD_TYPE, HIP_COORD_TYPE) \ + HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_a1d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(struct textureReference GLOBAL_SPACE * ptr, uint32_t layer, HIP_COORD_TYPE x) \ + { \ + hipTextureObject_t textureObject = ptr->textureObject; \ + return tex1DLayered(textureObject, float(x), int(layer)).data; \ + } \ + HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_level_a1d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(struct textureReference GLOBAL_SPACE * ptr, uint32_t layer, HIP_COORD_TYPE x, HIP_COORD_TYPE lod) \ + { \ + hipTextureObject_t textureObject = ptr->textureObject; \ + return tex1DLayeredLod(textureObject, float(x), int(layer), float(lod)).data; \ + } \ + HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_indirect_a1d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(uint64_t texobj, uint32_t layer, HIP_COORD_TYPE x) \ + { \ + hipTextureObject_t textureObject = (hipTextureObject_t)texobj; \ + return tex1DLayered(textureObject, float(x), int(layer)).data; \ + } \ + HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_level_indirect_a1d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(uint64_t texobj, uint32_t layer, HIP_COORD_TYPE x, HIP_COORD_TYPE lod) \ + { \ + hipTextureObject_t textureObject = (hipTextureObject_t)texobj; \ + return tex1DLayeredLod(textureObject, float(x), int(layer), float(lod)).data; \ } __device__ half4 __ockl_image_sampleh_1Da(unsigned int CONSTANT_SPACE *i, unsigned int ADDRESS_SPACE_CONSTANT *s, float2::Native_vec_ c); @@ -823,16 +853,26 @@ extern "C" tex_a1d_f16(s32, int); tex_a1d_f16(f32, float); -#define tex_a2d(CHANNEL_TYPE, HIP_CHANNEL_TYPE, COORD_TYPE, HIP_COORD_TYPE) \ - HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_a2d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(struct textureReference GLOBAL_SPACE * ptr, uint32_t layer, HIP_COORD_TYPE x, HIP_COORD_TYPE y) \ - { \ - hipTextureObject_t textureObject = ptr->textureObject; \ - return tex2DLayered(textureObject, float(x), float(y), int(layer)).data; \ - } \ - HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_indirect_a2d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(uint64_t texobj, uint32_t layer, HIP_COORD_TYPE x, HIP_COORD_TYPE y) \ - { \ - hipTextureObject_t textureObject = (hipTextureObject_t)texobj; \ - return tex2DLayered(textureObject, float(x), float(y), int(layer)).data; \ +#define tex_a2d(CHANNEL_TYPE, HIP_CHANNEL_TYPE, COORD_TYPE, HIP_COORD_TYPE) \ + HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_a2d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(struct textureReference GLOBAL_SPACE * ptr, uint32_t layer, HIP_COORD_TYPE x, HIP_COORD_TYPE y) \ + { \ + hipTextureObject_t textureObject = ptr->textureObject; \ + return tex2DLayered(textureObject, float(x), float(y), int(layer)).data; \ + } \ + HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_level_a2d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(struct textureReference GLOBAL_SPACE * ptr, uint32_t layer, HIP_COORD_TYPE x, HIP_COORD_TYPE y, HIP_COORD_TYPE lod) \ + { \ + hipTextureObject_t textureObject = ptr->textureObject; \ + return tex2DLayeredLod(textureObject, float(x), float(y), int(layer), float(lod)).data; \ + } \ + HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_indirect_a2d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(uint64_t texobj, uint32_t layer, HIP_COORD_TYPE x, HIP_COORD_TYPE y) \ + { \ + hipTextureObject_t textureObject = (hipTextureObject_t)texobj; \ + return tex2DLayered(textureObject, float(x), float(y), int(layer)).data; \ + } \ + HIP_CHANNEL_TYPE##4 ::Native_vec_ FUNC(tex_level_indirect_a2d_v4_##CHANNEL_TYPE##_##COORD_TYPE)(uint64_t texobj, uint32_t layer, HIP_COORD_TYPE x, HIP_COORD_TYPE y, 
HIP_COORD_TYPE lod) \ + { \ + hipTextureObject_t textureObject = (hipTextureObject_t)texobj; \ + return tex2DLayeredLod(textureObject, float(x), float(y), int(layer), float(lod)).data; \ } __device__ half4 __ockl_image_sampleh_2Da(unsigned int CONSTANT_SPACE *i, unsigned int ADDRESS_SPACE_CONSTANT *s, float4::Native_vec_ c); diff --git a/ptx/src/ast.rs b/ptx/src/ast.rs index e5b5f976..55687739 100644 --- a/ptx/src/ast.rs +++ b/ptx/src/ast.rs @@ -460,8 +460,8 @@ pub enum Instruction { Membar { level: MemScope, }, - Tex(TexDetails, Arg4Tex
<P>), - Suld(SurfaceDetails, Arg4Tex<P>), + Tex(TexDetails, Arg5Tex<P>), + Suld(SurfaceDetails, Arg5Tex<P>), Sust(SurfaceDetails, Arg4Sust<P>), Shfl(ShflMode, Arg5Shfl<P>), Shf(FunnelShift, Arg4<P>), @@ -476,6 +476,7 @@ pub enum Instruction { MatchAny(Arg3<P>), Red(AtomDetails, Arg2St<P>), Nanosleep(Arg1<P>), + Isspacep(StateSpace, Arg2<P>), Sad(ScalarType, Arg4<P>
), } @@ -617,13 +618,6 @@ pub struct Arg4Setp { pub src2: P::Operand, } -pub struct Arg4Tex { - pub dst: P::Operand, - pub image: P::Operand, - pub layer: Option, - pub coordinates: P::Operand, -} - pub struct Arg4Sust { pub image: P::Operand, pub coordinates: P::Operand, @@ -639,6 +633,14 @@ pub struct Arg5 { pub src4: P::Operand, } +pub struct Arg5Tex { + pub dst: P::Operand, + pub image: P::Operand, + pub layer: Option, + pub coordinates: P::Operand, + pub lod: Option, +} + pub struct Arg5Setp { pub dst1: P::Id, pub dst2: Option, @@ -1317,6 +1319,7 @@ pub enum TuningDirective { MaxNtid(u32, u32, u32), ReqNtid(u32, u32, u32), MinNCtaPerSm(u32), + Noreturn, } #[repr(u8)] @@ -1382,8 +1385,8 @@ pub enum TextureGeometry { #[derive(Clone)] pub enum Initializer { Constant(ImmediateValue), - Global(ID, Type), - GenericGlobal(ID, Type), + Global(ID), + GenericGlobal(ID), Add(Box<(Initializer, Initializer)>), Array(Vec>), } diff --git a/ptx/src/emit.rs b/ptx/src/emit.rs index e2d00d98..1c88cd0e 100644 --- a/ptx/src/emit.rs +++ b/ptx/src/emit.rs @@ -403,27 +403,20 @@ unsafe fn get_llvm_const( let const2 = get_llvm_const(ctx, type_, Some(init2))?; LLVMConstAdd(const1, const2) } - (_, Some(ast::Initializer::Global(id, type_))) => { + (_, Some(ast::Initializer::Global(id))) => { let name = ctx.names.value(id)?; let b64 = get_llvm_type(ctx, &ast::Type::Scalar(ast::ScalarType::B64))?; - let mut zero = LLVMConstInt(b64, 0, 0); - let src_type = get_llvm_type(ctx, &type_)?; - let global_ptr = LLVMConstInBoundsGEP2(src_type, name, &mut zero, 1); - LLVMConstPtrToInt(global_ptr, b64) + LLVMConstPtrToInt(name, b64) } - (_, Some(ast::Initializer::GenericGlobal(id, type_))) => { + (_, Some(ast::Initializer::GenericGlobal(id))) => { let name = ctx.names.value(id)?; - let b64 = get_llvm_type(ctx, &ast::Type::Scalar(ast::ScalarType::B64))?; - let mut zero = LLVMConstInt(b64, 0, 0); - let src_type = get_llvm_type(ctx, &type_)?; - let global_ptr = LLVMConstInBoundsGEP2(src_type, name, &mut zero, 1); - // void pointers are illegal in LLVM IR let b8 = get_llvm_type(ctx, &ast::Type::Scalar(ast::ScalarType::B8))?; let b8_generic_ptr = LLVMPointerType( b8, get_llvm_address_space(&ctx.constants, ast::StateSpace::Generic)?, ); - let generic_ptr = LLVMConstAddrSpaceCast(global_ptr, b8_generic_ptr); + let generic_ptr = LLVMConstAddrSpaceCast(name, b8_generic_ptr); + let b64 = get_llvm_type(ctx, &ast::Type::Scalar(ast::ScalarType::B64))?; LLVMConstPtrToInt(generic_ptr, b64) } _ => return Err(TranslateError::todo()), @@ -551,7 +544,7 @@ fn emit_method<'a, 'input>( emit_statement(ctx, is_kernel, statement)?; } // happens if there is a post-ret trailing label - terminate_current_block_if_needed(ctx, None); + terminate_current_block_if_not_terminated(ctx, None); unsafe { LLVMPositionBuilderAtEnd(ctx.builder.get(), bb_with_variables) }; unsafe { LLVMBuildBr(ctx.builder.get(), starting_bb) }; Ok(()) @@ -593,6 +586,17 @@ fn emit_tuning_single<'a>( format!("{0},{0}", size).as_bytes(), ); } + ast::TuningDirective::Noreturn => { + let noreturn = b"noreturn"; + let attr_kind = unsafe { + LLVMGetEnumAttributeKindForName(noreturn.as_ptr().cast(), noreturn.len()) + }; + if attr_kind == 0 { + panic!(); + } + let noreturn = unsafe { LLVMCreateEnumAttribute(ctx.context.get(), attr_kind, 0) }; + unsafe { LLVMAddAttributeAtIndex(llvm_method, LLVMAttributeFunctionIndex, noreturn) }; + } } } @@ -621,6 +625,9 @@ fn emit_statement( is_kernel: bool, statement: crate::translate::ExpandedStatement, ) -> Result<(), TranslateError> { + if 
!matches!(statement, crate::translate::Statement::Label(..)) { + start_next_block_if_terminated(ctx); + } Ok(match statement { crate::translate::Statement::Label(label) => emit_label(ctx, label)?, crate::translate::Statement::Variable(var) => emit_function_variable(ctx, var)?, @@ -1155,6 +1162,7 @@ fn emit_instruction( ast::Instruction::Vshr(arg) => emit_inst_vshr(ctx, arg)?, ast::Instruction::Set(details, arg) => emit_inst_set(ctx, details, arg)?, ast::Instruction::Red(details, arg) => emit_inst_red(ctx, details, arg)?, + ast::Instruction::Isspacep(space, arg) => emit_inst_isspacep(ctx, *space, arg)?, ast::Instruction::Sad(type_, arg) => emit_inst_sad(ctx, *type_, arg)?, // replaced by function calls or Statement variants ast::Instruction::Activemask { .. } @@ -1180,6 +1188,70 @@ fn emit_instruction( }) } +fn emit_inst_isspacep( + ctx: &mut EmitContext, + space: ast::StateSpace, + arg: &ast::Arg2, +) -> Result<(), TranslateError> { + match space { + ast::StateSpace::Local => { + emit_inst_isspacep_impl(ctx, Some(arg.dst), arg.src, b"llvm.amdgcn.is.private\0")?; + Ok(()) + } + ast::StateSpace::Shared => { + emit_inst_isspacep_impl(ctx, Some(arg.dst), arg.src, b"llvm.amdgcn.is.shared\0")?; + Ok(()) + } + ast::StateSpace::Global => { + let builder = ctx.builder.get(); + let is_private = + emit_inst_isspacep_impl(ctx, None, arg.src, b"llvm.amdgcn.is.private\0")?; + let is_shared = + emit_inst_isspacep_impl(ctx, None, arg.src, b"llvm.amdgcn.is.shared\0")?; + let private_or_shared = + unsafe { LLVMBuildOr(builder, is_private, is_shared, LLVM_UNNAMED) }; + let i1_true = unsafe { + LLVMConstInt( + get_llvm_type(ctx, &ast::Type::Scalar(ast::ScalarType::Pred))?, + 1, + 0, + ) + }; + ctx.names.register_result(arg.dst, |dst| unsafe { + // I'd rather use LLVMBuildNeg(...), but when using LLVMBuildNeg(...) 
in LLVM 15, + // LLVM emits this broken IR: + // %"14" = sub i1 false, %4 + LLVMBuildSub(builder, i1_true, private_or_shared, dst) + }); + Ok(()) + } + _ => Err(TranslateError::unreachable()), + } +} + +fn emit_inst_isspacep_impl( + ctx: &mut EmitContext, + dst: Option, + src: Id, + intrinsic: &[u8], +) -> Result { + let src = ctx.names.value(src)?; + let b8 = get_llvm_type(ctx, &ast::Type::Scalar(ast::ScalarType::B8))?; + let b8_generic_ptr = unsafe { + LLVMPointerType( + b8, + get_llvm_address_space(&ctx.constants, ast::StateSpace::Generic)?, + ) + }; + let src = unsafe { LLVMBuildIntToPtr(ctx.builder.get(), src, b8_generic_ptr, LLVM_UNNAMED) }; + emit_intrinsic_arg2( + ctx, + (ast::ScalarType::Pred, dst), + (ast::ScalarType::B8, ast::StateSpace::Generic, src), + intrinsic, + ) +} + fn emit_inst_sad( ctx: &mut EmitContext, type_: ast::ScalarType, @@ -1255,7 +1327,8 @@ fn emit_inst_bfind( let builder = ctx.builder.get(); let src = arg.src.get_llvm_value(&mut ctx.names)?; let llvm_dst_type = get_llvm_type(ctx, &ast::Type::Scalar(ast::ScalarType::U32))?; - let const_0 = unsafe { LLVMConstInt(llvm_dst_type, 0, 0) }; + let llvm_src_type = get_llvm_type(ctx, &ast::Type::Scalar(details.type_))?; + let const_0 = unsafe { LLVMConstInt(llvm_src_type, 0, 0) }; let const_int_max = unsafe { LLVMConstInt(llvm_dst_type, u64::MAX, 0) }; let is_zero = unsafe { LLVMBuildICmp( @@ -1266,7 +1339,7 @@ fn emit_inst_bfind( LLVM_UNNAMED, ) }; - let mut clz_result = emit_inst_clz_impl(ctx, ast::ScalarType::U32, None, arg.src, true)?; + let mut clz_result = emit_inst_clz_impl(ctx, details.type_, None, arg.src, true)?; if !details.shift { let bits = unsafe { LLVMConstInt( @@ -1442,7 +1515,7 @@ fn emit_inst_abs( emit_intrinsic_arg2( ctx, (details.typ, Some(args.dst)), - (details.typ, args.src), + (details.typ, ast::StateSpace::Reg, args.src), intrinsic_name.as_bytes(), )?; } else { @@ -1610,7 +1683,7 @@ fn emit_inst_rsqrt( let sqrt_result = emit_intrinsic_arg2( ctx, (details.typ, None), - (details.typ, args.src), + (details.typ, ast::StateSpace::Reg, args.src), intrinsic_fn, )?; unsafe { LLVMZludaSetFastMathFlags(sqrt_result, FastMathFlags::ApproxFunc) }; @@ -1668,7 +1741,7 @@ fn emit_inst_sqrt( let sqrt_result = emit_intrinsic_arg2( ctx, (details.type_, Some(args.dst)), - (details.type_, args.src), + (details.type_, ast::StateSpace::Reg, args.src), intrinsic_fn, )?; unsafe { LLVMZludaSetFastMathFlags(sqrt_result, fast_math) }; @@ -2491,7 +2564,7 @@ fn emit_inst_cvt( emit_intrinsic_arg2( ctx, (type_, Some(args.dst)), - (type_, args.src), + (type_, ast::StateSpace::Reg, args.src), intrinsic_fn, )?; } @@ -2505,7 +2578,7 @@ fn emit_inst_cvt( emit_intrinsic_arg2( ctx, (type_, Some(args.dst)), - (type_, args.src), + (type_, ast::StateSpace::Reg, args.src), intrinsic_fn, )?; } @@ -2519,7 +2592,7 @@ fn emit_inst_cvt( emit_intrinsic_arg2( ctx, (type_, Some(args.dst)), - (type_, args.src), + (type_, ast::StateSpace::Reg, args.src), intrinsic_fn, )?; } @@ -2533,7 +2606,7 @@ fn emit_inst_cvt( emit_intrinsic_arg2( ctx, (type_, Some(args.dst)), - (type_, args.src), + (type_, ast::StateSpace::Reg, args.src), intrinsic_fn, )?; } @@ -2699,7 +2772,7 @@ fn emit_inst_cos( let cos_value = emit_intrinsic_arg2( ctx, (ast::ScalarType::F32, Some(args.dst)), - (ast::ScalarType::F32, args.src), + (ast::ScalarType::F32, ast::StateSpace::Reg, args.src), function_name, )?; unsafe { LLVMZludaSetFastMathFlags(cos_value, FastMathFlags::ApproxFunc) }; @@ -2714,7 +2787,7 @@ fn emit_inst_sin( let cos_value = emit_intrinsic_arg2( ctx, 
(ast::ScalarType::F32, Some(args.dst)), - (ast::ScalarType::F32, args.src), + (ast::ScalarType::F32, ast::StateSpace::Reg, args.src), function_name, )?; unsafe { LLVMZludaSetFastMathFlags(cos_value, FastMathFlags::ApproxFunc) }; @@ -2918,7 +2991,7 @@ fn emit_inst_brev( emit_intrinsic_arg2( ctx, (type_, Some(args.dst)), - (type_, args.src), + (type_, ast::StateSpace::Reg, args.src), function_name, )?; Ok(()) @@ -2936,8 +3009,12 @@ fn emit_inst_popc( _ => return Err(TranslateError::unreachable()), }; let popc_dst = if shorten { None } else { Some(args.dst) }; - let popc_result = - emit_intrinsic_arg2(ctx, (type_, popc_dst), (type_, args.src), function_name)?; + let popc_result = emit_intrinsic_arg2( + ctx, + (type_, popc_dst), + (type_, ast::StateSpace::Reg, args.src), + function_name, + )?; if shorten { let llvm_i32 = get_llvm_type(ctx, &ast::Type::Scalar(ast::ScalarType::U32))?; ctx.names.register_result(args.dst, |dst_name| unsafe { @@ -2955,7 +3032,7 @@ fn emit_inst_ex2( let llvm_value = emit_intrinsic_arg2( ctx, (ast::ScalarType::F32, Some(args.dst)), - (ast::ScalarType::F32, args.src), + (ast::ScalarType::F32, ast::StateSpace::Reg, args.src), function_name, )?; unsafe { LLVMZludaSetFastMathFlags(llvm_value, FastMathFlags::ApproxFunc) }; @@ -2970,7 +3047,7 @@ fn emit_inst_lg2( let llvm_value = emit_intrinsic_arg2( ctx, (ast::ScalarType::F32, Some(args.dst)), - (ast::ScalarType::F32, args.src), + (ast::ScalarType::F32, ast::StateSpace::Reg, args.src), function_name, )?; unsafe { LLVMZludaSetFastMathFlags(llvm_value, FastMathFlags::ApproxFunc) }; @@ -3009,16 +3086,16 @@ fn emit_intrinsic_arg0( fn emit_intrinsic_arg2( ctx: &mut EmitContext, (dst_type, dst): (ast::ScalarType, Option), - (src_type, src): (ast::ScalarType, Id), + (src_type, src_space, src): (ast::ScalarType, ast::StateSpace, impl GetLLVMValue), intrinsic_name: &[u8], ) -> Result { let builder = ctx.builder.get(); - let mut llvm_src = ctx.names.value(src)?; + let mut llvm_src = src.get_llvm_value(&mut ctx.names)?; let dst_type = get_llvm_type(ctx, &ast::Type::Scalar(dst_type))?; let function_type = get_llvm_function_type( ctx, dst_type, - iter::once((&ast::Type::Scalar(src_type), ast::StateSpace::Reg)), + iter::once((&ast::Type::Scalar(src_type), src_space)), )?; let mut function_value = unsafe { LLVMGetNamedFunction(ctx.module.get(), intrinsic_name.as_ptr() as _) }; @@ -3508,12 +3585,15 @@ fn emit_store_var( fn emit_label(ctx: &mut EmitContext, label: Id) -> Result<(), TranslateError> { let new_block = unsafe { LLVMValueAsBasicBlock(ctx.names.value(label)?) 
}; - terminate_current_block_if_needed(ctx, Some(new_block)); + terminate_current_block_if_not_terminated(ctx, Some(new_block)); unsafe { LLVMPositionBuilderAtEnd(ctx.builder.get(), new_block) }; Ok(()) } -fn terminate_current_block_if_needed(ctx: &mut EmitContext, new_block: Option) { +fn terminate_current_block_if_not_terminated( + ctx: &mut EmitContext, + new_block: Option, +) { let current_block = unsafe { LLVMGetInsertBlock(ctx.builder.get()) }; if current_block == ptr::null_mut() { return; @@ -3528,6 +3608,20 @@ fn terminate_current_block_if_needed(ctx: &mut EmitContext, new_block: Option( ctx: &mut EmitContext, method: &crate::translate::Function<'input>, diff --git a/ptx/src/ptx.lalrpop b/ptx/src/ptx.lalrpop index 5ec97e15..612d3bda 100644 --- a/ptx/src/ptx.lalrpop +++ b/ptx/src/ptx.lalrpop @@ -97,6 +97,7 @@ match { ".l", ".le", ".leu", + ".level", ".lo", ".loc", ".local", @@ -116,6 +117,7 @@ match { ".ne", ".neu", ".noftz", + ".noreturn", ".num", ".or", ".param", @@ -201,6 +203,7 @@ match { "function_name", "generic", "inlined_at", + "isspacep", "ld", "ldmatrix", "lg2", @@ -283,6 +286,7 @@ ExtendedID : &'input str = { "function_name", "generic", "inlined_at", + "isspacep", "ld", "ldmatrix", "lg2", @@ -531,6 +535,8 @@ LinkingDirective: ast::LinkingDirective = { }; TuningDirective: ast::TuningDirective = { + // not a performance tuning directive but fits here in the grammar + ".noreturn" => ast::TuningDirective::Noreturn, ".maxnreg" => ast::TuningDirective::MaxNReg(ncta), ".maxntid" => ast::TuningDirective::MaxNtid(nx, 1, 1), ".maxntid" "," => ast::TuningDirective::MaxNtid(nx, ny, 1), @@ -648,8 +654,8 @@ Initializer: ast::Initializer<&'input str> = { InitializerNoAdd: ast::Initializer<&'input str> = { => ast::Initializer::Constant(val), - => ast::Initializer::Global(id, ast::Type::Struct(Vec::new())), - "generic" "(" ")" => ast::Initializer::GenericGlobal(id, ast::Type::Struct(Vec::new())), + => ast::Initializer::Global(id), + "generic" "(" ")" => ast::Initializer::GenericGlobal(id), "{" > "}" => ast::Initializer::Array(array_init) } @@ -841,6 +847,7 @@ Instruction: ast::Instruction> = { InstMatch, InstRed, InstNanosleep, + InstIsspacep, InstSad }; @@ -2067,11 +2074,23 @@ InstMembar: ast::Instruction> = { // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions-tex InstTex: ast::Instruction> = { "tex" ".v4" "," "[" "," "]" => { - let args = ast::Arg4Tex { + let args = ast::Arg5Tex { dst, image, coordinates, - layer: None + layer: None, + lod: None + }; + let details = ast::TexDetails { geometry, channel_type, coordinate_type, direct: false }; + ast::Instruction::Tex(details, args) + }, + "tex" ".level" ".v4" "," "[" "," "]" "," => { + let args = ast::Arg5Tex { + dst, + image, + coordinates, + layer: None, + lod: Some(lod) }; let details = ast::TexDetails { geometry, channel_type, coordinate_type, direct: false }; ast::Instruction::Tex(details, args) @@ -2082,11 +2101,25 @@ InstTex: ast::Instruction> = { "tex" ".a1d" ".v4" "," "[" "," "{" "," "}" "]" => { let geometry = ast::TextureGeometry::Array1D; - let args = ast::Arg4Tex { + let args = ast::Arg5Tex { dst, image, coordinates: ast::Operand::VecPack(vec![x]), - layer: Some(layer) + layer: Some(layer), + lod: None + }; + let details = ast::TexDetails { geometry, channel_type, coordinate_type, direct: false }; + ast::Instruction::Tex(details, args) + }, + "tex" ".level" ".a1d" ".v4" + "," "[" "," "{" "," "}" "]" "," => { + let geometry = ast::TextureGeometry::Array1D; + let args = ast::Arg5Tex { + 
dst, + image, + coordinates: ast::Operand::VecPack(vec![x]), + layer: Some(layer), + lod: Some(lod) }; let details = ast::TexDetails { geometry, channel_type, coordinate_type, direct: false }; ast::Instruction::Tex(details, args) @@ -2094,11 +2127,25 @@ InstTex: ast::Instruction> = { "tex" ".a2d" ".v4" "," "[" "," "{" "," "," "," RegOrImmediate "}" "]" => { let geometry = ast::TextureGeometry::Array2D; - let args = ast::Arg4Tex { + let args = ast::Arg5Tex { dst, image, coordinates: ast::Operand::VecPack(vec![x, y]), - layer: Some(layer) + layer: Some(layer), + lod: None + }; + let details = ast::TexDetails { geometry, channel_type, coordinate_type, direct: false }; + ast::Instruction::Tex(details, args) + }, + "tex" ".level" ".a2d" ".v4" + "," "[" "," "{" "," "," "," RegOrImmediate "}" "]" "," => { + let geometry = ast::TextureGeometry::Array2D; + let args = ast::Arg5Tex { + dst, + image, + coordinates: ast::Operand::VecPack(vec![x, y]), + layer: Some(layer), + lod: Some(lod) }; let details = ast::TexDetails { geometry, channel_type, coordinate_type, direct: false }; ast::Instruction::Tex(details, args) @@ -2108,33 +2155,36 @@ // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#surface-instructions-suld InstSuld: ast::Instruction> = { "suld" ".b" ".trap" "," "[" "," "]" => { - let args = ast::Arg4Tex { + let args = ast::Arg5Tex { dst, image, coordinates, layer: None, + lod: None, }; let details = ast::SurfaceDetails { geometry, vector, type_, direct: false, }; ast::Instruction::Suld(details, args) }, "suld" ".b" ".a1d" ".trap" "," "[" "," "{" "," "}" "]" => { let geometry = ast::TextureGeometry::Array1D; - let args = ast::Arg4Tex { + let args = ast::Arg5Tex { dst, image, coordinates: ast::Operand::VecPack(vec![x]), layer: Some(layer), + lod: None, }; let details = ast::SurfaceDetails { geometry, vector, type_, direct: false, }; ast::Instruction::Suld(details, args) }, "suld" ".b" ".a2d" ".trap" "," "[" "," "{" "," "," "," RegOrImmediate "}" "]" => { let geometry = ast::TextureGeometry::Array2D; - let args = ast::Arg4Tex { + let args = ast::Arg5Tex { dst, image, coordinates: ast::Operand::VecPack(vec![x, y]), layer: Some(layer), + lod: None, }; let details = ast::SurfaceDetails { geometry, vector, type_, direct: false, }; ast::Instruction::Suld(details, args) @@ -2380,6 +2430,20 @@ InstNanosleep: ast::Instruction> = { } } +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-isspacep +InstIsspacep: ast::Instruction> = { + "isspacep" ".local" => { + ast::Instruction::Isspacep(ast::StateSpace::Local, a) + }, + "isspacep" ".shared" => { + ast::Instruction::Isspacep(ast::StateSpace::Shared, a) + }, + "isspacep" ".global" => { + ast::Instruction::Isspacep(ast::StateSpace::Global, a) + } +} + + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sad InstSad: ast::Instruction> = { "sad" => { @@ -2387,6 +2451,7 @@ InstSad: ast::Instruction> = { } } + NegTypeFtz: ast::ScalarType = { ".f16" => ast::ScalarType::F16, ".f16x2" => ast::ScalarType::F16x2, diff --git a/ptx/src/raytracing.rs b/ptx/src/raytracing.rs index 4c68a112..25c2e7c6 100644 --- a/ptx/src/raytracing.rs +++ b/ptx/src/raytracing.rs @@ -2996,11 +2996,12 @@ fn convert_optix_builtin_variable_and_attribute_access_single_function<'input>( } Statement::Instruction(ast::Instruction::Tex( tex, - ast::Arg4Tex { + ast::Arg5Tex { dst, image, layer, coordinates, + lod, }, )) => { if let 
Some(StateSpaceRemapping::ToBlock(id, ast::StateSpace::Global, offset)) = @@ -3014,11 +3015,12 @@ fn convert_optix_builtin_variable_and_attribute_access_single_function<'input>( )?; result.push(Statement::Instruction(ast::Instruction::Tex( tex, - ast::Arg4Tex { + ast::Arg5Tex { dst, image, layer, coordinates, + lod, }, ))); } else { diff --git a/ptx/src/test/spirv_build/noreturn.ll b/ptx/src/test/spirv_build/noreturn.ll new file mode 100644 index 00000000..286b289a --- /dev/null +++ b/ptx/src/test/spirv_build/noreturn.ll @@ -0,0 +1,19 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +; Function Attrs: noreturn +define private void @noreturn(i64 %"6") #0 { +"9": + %"3" = alloca i64, align 8, addrspace(5) + %"4" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"4", align 1 + %"5" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"5", align 1 + %"8" = alloca i64, align 8, addrspace(5) + store i64 %"6", ptr addrspace(5) %"3", align 8 + %"7" = load i64, ptr addrspace(5) %"3", align 8 + store i64 %"7", ptr addrspace(5) %"8", align 8 + ret void +} + +attributes #0 = { noreturn "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_build/noreturn.ptx b/ptx/src/test/spirv_build/noreturn.ptx new file mode 100644 index 00000000..fd34bc6a --- /dev/null +++ b/ptx/src/test/spirv_build/noreturn.ptx @@ -0,0 +1,8 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.weak .func noreturn(.param .b64 noreturn_0) +.noreturn +{ +} \ No newline at end of file diff --git a/ptx/src/test/spirv_run/call_global_ptr.ll b/ptx/src/test/spirv_run/call_global_ptr.ll new file mode 100644 index 00000000..edd07ebf --- /dev/null +++ b/ptx/src/test/spirv_run/call_global_ptr.ll @@ -0,0 +1,71 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +@fn_ptrs = protected addrspace(1) externally_initialized global [2 x i64] [i64 0, i64 ptrtoint (ptr @incr to i64)], align 8 + +define private i64 @incr(i64 %"36") #0 { +"60": + %"21" = alloca i64, align 8, addrspace(5) + %"20" = alloca i64, align 8, addrspace(5) + %"24" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"24", align 1 + %"25" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"25", align 1 + %"51" = alloca i64, align 8, addrspace(5) + %"52" = alloca i64, align 8, addrspace(5) + %"17" = alloca i64, align 8, addrspace(5) + store i64 %"36", ptr addrspace(5) %"21", align 8 + %"37" = load i64, ptr addrspace(5) %"21", align 8 + store i64 %"37", ptr addrspace(5) %"52", align 8 + %"38" = load i64, ptr addrspace(5) %"52", align 8 + store i64 %"38", ptr addrspace(5) %"17", align 8 + %"40" = load i64, ptr addrspace(5) %"17", align 8 + %"39" = add i64 %"40", 1 + store i64 %"39", ptr addrspace(5) %"17", align 8 + %"41" = load i64, ptr addrspace(5) %"17", align 8 + store i64 %"41", ptr addrspace(5) %"51", align 8 + %"42" = load i64, ptr addrspace(5) %"51", align 8 + store i64 %"42", ptr addrspace(5) %"20", align 8 + %"43" = load i64, ptr addrspace(5) %"20", align 8 + ret i64 %"43" +} + 
+define protected amdgpu_kernel void @call_global_ptr(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #0 { +"59": + %"22" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"22", align 1 + %"23" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"23", align 1 + %"8" = alloca i64, align 8, addrspace(5) + %"9" = alloca i64, align 8, addrspace(5) + %"10" = alloca i64, align 8, addrspace(5) + %"11" = alloca i64, align 8, addrspace(5) + %"49" = alloca i64, align 8, addrspace(5) + %"50" = alloca i64, align 8, addrspace(5) + %"26" = load i64, ptr addrspace(4) %"47", align 8 + store i64 %"26", ptr addrspace(5) %"8", align 8 + %"27" = load i64, ptr addrspace(4) %"48", align 8 + store i64 %"27", ptr addrspace(5) %"9", align 8 + %"29" = load i64, ptr addrspace(5) %"8", align 8 + %"53" = inttoptr i64 %"29" to ptr addrspace(1) + %"28" = load i64, ptr addrspace(1) %"53", align 8 + store i64 %"28", ptr addrspace(5) %"10", align 8 + %"30" = load i64, ptr addrspace(5) %"10", align 8 + store i64 %"30", ptr addrspace(5) %"49", align 8 + %"31" = load i64, ptr getelementptr inbounds (i8, ptr addrspacecast (ptr addrspace(1) @fn_ptrs to ptr), i64 8), align 8 + store i64 %"31", ptr addrspace(5) %"11", align 8 + %"18" = load i64, ptr addrspace(5) %"49", align 8 + %"32" = load i64, ptr addrspace(5) %"11", align 8 + %0 = inttoptr i64 %"32" to ptr + %"19" = call i64 %0(i64 %"18") + store i64 %"19", ptr addrspace(5) %"50", align 8 + %"33" = load i64, ptr addrspace(5) %"50", align 8 + store i64 %"33", ptr addrspace(5) %"10", align 8 + %"34" = load i64, ptr addrspace(5) %"9", align 8 + %"35" = load i64, ptr addrspace(5) %"10", align 8 + %"58" = inttoptr i64 %"34" to ptr addrspace(1) + store i64 %"35", ptr addrspace(1) %"58", align 8 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/call_global_ptr.ptx b/ptx/src/test/spirv_run/call_global_ptr.ptx new file mode 100644 index 00000000..59b1d262 --- /dev/null +++ b/ptx/src/test/spirv_run/call_global_ptr.ptx @@ -0,0 +1,43 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.weak .func (.param.u64 output) incr (.param.u64 input); + +.weak .global .align 8 .u64 fn_ptrs[2] = {0, incr}; + +.visible .entry call_global_ptr( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u64 temp; + .reg .u64 fn_ptr; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.global.u64 temp, [in_addr]; + .param.u64 incr_in; + .param.u64 incr_out; + st.param.b64 [incr_in], temp; +incr_fn_ptr: .callprototype (.param .u64 _) _ (.param .u64 _); + ld.u64 fn_ptr, [fn_ptrs+8]; + call (incr_out), fn_ptr, (incr_in), incr_fn_ptr; + ld.param.u64 temp, [incr_out]; + st.global.u64 [out_addr], temp; + ret; +} + +.weak .func (.param .u64 output) incr( + .param .u64 input +) +{ + .reg .u64 temp; + ld.param.u64 temp, [input]; + add.u64 temp, temp, 1; + st.param.u64 [output], temp; + ret; +} \ No newline at end of file diff --git a/ptx/src/test/spirv_run/isspacep.ll b/ptx/src/test/spirv_run/isspacep.ll new file mode 100644 index 00000000..08371e31 --- /dev/null +++ b/ptx/src/test/spirv_run/isspacep.ll @@ -0,0 +1,57 @@ +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @isspacep(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { +"36": + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"11", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i1, align 1, addrspace(5) + %"7" = alloca i1, align 1, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"12" = load i64, ptr addrspace(4) %"32", align 8 + store i64 %"12", ptr addrspace(5) %"4", align 8 + %"13" = load i64, ptr addrspace(4) %"33", align 8 + store i64 %"13", ptr addrspace(5) %"5", align 8 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %0 = inttoptr i64 %"15" to ptr + %1 = call i1 @llvm.amdgcn.is.private(ptr %0) + %2 = inttoptr i64 %"15" to ptr + %3 = call i1 @llvm.amdgcn.is.shared(ptr %2) + %4 = or i1 %1, %3 + %"14" = sub i1 true, %4 + store i1 %"14", ptr addrspace(5) %"6", align 1 + %"17" = load i1, ptr addrspace(5) %"6", align 1 + %"16" = select i1 %"17", i32 1, i32 0 + store i32 %"16", ptr addrspace(5) %"8", align 4 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %5 = inttoptr i64 %"19" to ptr + %"18" = call i1 @llvm.amdgcn.is.shared(ptr %5) + store i1 %"18", ptr addrspace(5) %"7", align 1 + %"21" = load i1, ptr addrspace(5) %"7", align 1 + %"20" = select i1 %"21", i32 1, i32 0 + store i32 %"20", ptr addrspace(5) %"9", align 4 + %"22" = load i64, ptr addrspace(5) %"5", align 8 + %"23" = load i32, ptr addrspace(5) %"8", align 4 + %"34" = inttoptr i64 %"22" to ptr + store i32 %"23", ptr %"34", align 4 + %"24" = load i64, ptr addrspace(5) %"5", align 8 + %"25" = load i32, ptr addrspace(5) %"9", align 4 + %"35" = inttoptr i64 %"24" to ptr + %"38" = getelementptr inbounds i8, ptr %"35", i64 4 + store i32 %"25", ptr %"38", align 4 + ret void +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare i1 @llvm.amdgcn.is.private(ptr nocapture) #1 + +; Function Attrs: nounwind readnone speculatable willreturn +declare i1 @llvm.amdgcn.is.shared(ptr nocapture) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/isspacep.ptx b/ptx/src/test/spirv_run/isspacep.ptx new file mode 100644 index 00000000..55d39f54 --- /dev/null +++ b/ptx/src/test/spirv_run/isspacep.ptx @@ -0,0 +1,28 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry isspacep( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .pred is_global; + .reg .pred is_shared; + + .reg .u32 is_global_u32; + .reg .u32 is_shared_u32; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + isspacep.global is_global, in_addr; + selp.u32 is_global_u32, 1, 0, is_global; + isspacep.shared is_shared, in_addr; + selp.u32 is_shared_u32, 1, 0, is_shared; + st.u32 [out_addr], is_global_u32; + st.u32 [out_addr+4], is_shared_u32; + ret; +} diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index 5fb5a8b1..71dbd061 
100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -127,6 +127,7 @@ test_ptx!(cvta, [3.0f32], [3.0f32]); test_ptx!(block, [1u64], [2u64]); test_ptx!(local_align, [1u64], [1u64]); test_ptx!(call, [1u64], [2u64]); +test_ptx!(call_global_ptr, [12u64], [13u64]); // In certain situations LLVM will miscompile AMDGPU binaries. // This happens if the return type of a function is a .b8 array. // This test checks if our workaround for this bug works @@ -364,6 +365,7 @@ test_ptx!( [1923569713u64, 1923569712], [1923569713u64, 1923569712] ); +test_ptx!(isspacep, [0xDEADu32], [1u32, 0]); test_ptx!(sad, [2147483648u32, 2, 13], [2147483659u32, 2147483663]); test_ptx_warp!( diff --git a/ptx/src/translate.rs b/ptx/src/translate.rs index b06fa527..99fc3562 100644 --- a/ptx/src/translate.rs +++ b/ptx/src/translate.rs @@ -392,7 +392,7 @@ impl<'a, 'input> LinkingResolver<'a, 'input> { linking, Cow::Borrowed(decl.name()), symbol, - decl.name.is_kernel(), + decl.name.is_kernel() && is_definition, ) } @@ -591,10 +591,21 @@ impl<'input> ResolvedLinking<'input> { explicit_initializer: bool, ) -> Result { if linking == ast::LinkingDirective::None { - if self.implicit_globals.get(&name).copied() == Some((module, directive)) { - Ok(VisibilityAdjustment::Global) - } else { - Ok(VisibilityAdjustment::Module) + match self.implicit_globals.get(&name).copied() { + Some((implicit_module, implicit_directive)) => { + if implicit_module == module { + if implicit_directive == directive { + Ok(VisibilityAdjustment::Global) + } else { + // If it were something other than a declaration it would + // fail module-level symbol resolution + Ok(VisibilityAdjustment::GlobalDeclaration(None)) + } + } else { + Ok(VisibilityAdjustment::Module) + } + } + None => Ok(VisibilityAdjustment::Module), } } else { if let Some((global_module, global_directive, type_)) = self.explicit_globals.get(&name) @@ -1031,10 +1042,8 @@ fn normalize_method<'a, 'b, 'input>( normalize_method_params(&mut fn_scope, &*method.func_directive.return_arguments)?; let input_arguments = normalize_method_params(&mut fn_scope, &*method.func_directive.input_arguments)?; - if !is_kernel { - if let hash_map::Entry::Vacant(entry) = function_decls.entry(name) { - entry.insert((return_arguments.clone(), input_arguments.clone())); - } + if let hash_map::Entry::Vacant(entry) = function_decls.entry(name) { + entry.insert((return_arguments.clone(), input_arguments.clone())); } let source_name = if has_global_name { Some(Cow::Borrowed(method.func_directive.name())) @@ -1188,11 +1197,9 @@ fn expand_initializer2<'a, 'b, 'input>( ) -> Result, TranslateError> { Ok(match init { ast::Initializer::Constant(c) => ast::Initializer::Constant(c), - ast::Initializer::Global(g, type_) => { - ast::Initializer::Global(scope.get_id_in_module_scope(g)?, type_) - } - ast::Initializer::GenericGlobal(g, type_) => { - ast::Initializer::GenericGlobal(scope.get_id_in_module_scope(g)?, type_) + ast::Initializer::Global(g) => ast::Initializer::Global(scope.get_id_in_module_scope(g)?), + ast::Initializer::GenericGlobal(g) => { + ast::Initializer::GenericGlobal(scope.get_id_in_module_scope(g)?) 
} ast::Initializer::Add(add) => { let (init1, init2) = *add; @@ -1285,11 +1292,7 @@ fn resolve_instruction_types<'input>( .map(|directive| { Ok(match directive { TranslationDirective::Variable(linking, compiled_name, var) => { - TranslationDirective::Variable( - linking, - compiled_name, - resolve_initializers(id_defs, var)?, - ) + TranslationDirective::Variable(linking, compiled_name, var) } TranslationDirective::Method(method) => { let body = match method.body { @@ -1461,9 +1464,7 @@ fn resolve_instruction_types_method<'input>( } }, Statement::Label(i) => result.push(Statement::Label(i)), - Statement::Variable(v) => { - result.push(Statement::Variable(resolve_initializers(id_defs, v)?)) - } + Statement::Variable(v) => result.push(Statement::Variable(v)), Statement::Conditional(c) => result.push(Statement::Conditional(c)), _ => return Err(TranslateError::unreachable()), } @@ -1471,39 +1472,6 @@ fn resolve_instruction_types_method<'input>( Ok(result) } -fn resolve_initializers<'input>( - id_defs: &mut IdNameMapBuilder<'input>, - mut v: Variable, -) -> Result { - fn resolve_initializer_impl<'input>( - id_defs: &mut IdNameMapBuilder<'input>, - init: &mut ast::Initializer, - ) -> Result<(), TranslateError> { - match init { - ast::Initializer::Constant(_) => {} - ast::Initializer::Global(name, type_) - | ast::Initializer::GenericGlobal(name, type_) => { - let (src_type, _, _, _) = id_defs.get_typed(*name)?; - *type_ = src_type; - } - ast::Initializer::Add(subinit) => { - resolve_initializer_impl(id_defs, &mut (*subinit).0)?; - resolve_initializer_impl(id_defs, &mut (*subinit).1)?; - } - ast::Initializer::Array(inits) => { - for init in inits.iter_mut() { - resolve_initializer_impl(id_defs, init)?; - } - } - } - Ok(()) - } - if let Some(ref mut init) = v.initializer { - resolve_initializer_impl(id_defs, init)?; - } - Ok(v) -} - // TODO: All this garbage should be replaced with proper constant propagation or // at least ability to visit statements without moving them struct KernelConstantsVisitor { @@ -2770,9 +2738,14 @@ fn replace_instructions_with_builtins_impl<'input>( } Statement::Instruction(ast::Instruction::Tex(tex, arg)) => { let geometry = tex.geometry.as_ptx(); + let op_name = if arg.lod.is_none() { + "tex" + } else { + "tex_level" + }; let fn_name = [ ZLUDA_PTX_PREFIX, - "tex", + op_name, tex.suffix(), "_", geometry, @@ -3340,6 +3313,7 @@ fn to_llvm_module_impl2<'a, 'input>( if let Some(ref mut raytracing_state) = raytracing { translation_module = raytracing::run_on_normalized(translation_module, raytracing_state)?; } + let translation_module = return_from_noreturn(translation_module); let translation_module = extract_builtin_functions(translation_module); let translation_module = resolve_instruction_types(translation_module, functions)?; let mut translation_module = restructure_function_return_types(translation_module)?; @@ -3385,6 +3359,32 @@ fn to_llvm_module_impl2<'a, 'input>( }) } +// In PTX it's legal to have a function like this: +// .func noreturn(.param .b64 noreturn_0) +// .noreturn +// { +// } +// Which trips up LLVM. 
+// We normalize this by inserting `ret;`
+fn return_from_noreturn<'input>(
+    mut translation_module: TranslationModule<'input, NormalizedArgParams>,
+) -> TranslationModule<'input, NormalizedArgParams> {
+    for directive in translation_module.directives.iter_mut() {
+        match directive {
+            TranslationDirective::Method(method) => {
+                if let Some(ref mut body) = method.body {
+                    if body.is_empty() && method.tuning.contains(&ast::TuningDirective::Noreturn) {
+                        body.push(Statement::Instruction(ast::Instruction::Ret(
+                            ast::RetData { uniform: false },
+                        )));
+                    }
+                }
+            }
+            TranslationDirective::Variable(..) => {}
+        }
+    }
+    translation_module
+}
+
 // PTX definition of param state space does not translate cleanly into AMDGPU notion of an address space:
 //  .param in kernel arguments matches AMDGPU constant address space
 // .param in function arguments and variables matches AMDGPU private address space
@@ -3536,7 +3536,8 @@ fn create_metadata<'input>(
         match tuning {
             // TODO: measure
             ast::TuningDirective::MaxNReg(_)
-            | ast::TuningDirective::MinNCtaPerSm(_) => {}
+            | ast::TuningDirective::MinNCtaPerSm(_)
+            | ast::TuningDirective::Noreturn => {}
             ast::TuningDirective::MaxNtid(x, y, z) => {
                 let size = x as u64 * y as u64 * z as u64;
                 kernel_metadata.push((
@@ -3582,7 +3583,8 @@ fn insert_compilation_mode_prologue<'input>(
     for t in tuning.iter_mut() {
         match t {
             ast::TuningDirective::MaxNReg(_)
-            | ast::TuningDirective::MinNCtaPerSm(_) => {}
+            | ast::TuningDirective::MinNCtaPerSm(_)
+            | ast::TuningDirective::Noreturn => {}
             ast::TuningDirective::MaxNtid(_, _, z) => {
                 *z *= 2;
             }
@@ -6462,9 +6464,17 @@ impl<T: ArgParams> ast::Instruction<T> {
                 ast::Instruction::Tex(details, arg)
             }
             ast::Instruction::Suld(details, arg) => {
+                let image_type_space = if details.direct {
+                    (ast::Type::Texref, ast::StateSpace::Global)
+                } else {
+                    (
+                        ast::Type::Scalar(ast::ScalarType::B64),
+                        ast::StateSpace::Reg,
+                    )
+                };
                 let arg = arg.map(
                     visitor,
-                    (ast::Type::Surfref, ast::StateSpace::Global),
+                    image_type_space,
                     details.geometry,
                     details.value_type(),
                     ast::ScalarType::B32,
@@ -6550,6 +6560,15 @@ impl<T: ArgParams> ast::Instruction<T> {
             ast::Instruction::Sad(type_, a) => {
                 ast::Instruction::Sad(type_, a.map(visitor, &ast::Type::Scalar(type_), false)?)
             }
+            ast::Instruction::Isspacep(space, arg) => ast::Instruction::Isspacep(
+                space,
+                arg.map_different_types(
+                    visitor,
+                    &ast::Type::Scalar(ast::ScalarType::Pred),
+                    &ast::Type::Scalar(ast::ScalarType::U64),
+                )?,
+            ),
+
         })
     }
 }
@@ -6862,6 +6881,7 @@ impl<T: ArgParams> ast::Instruction<T> {
             ast::Instruction::Vshr { .. } => None,
             ast::Instruction::Dp4a { .. } => None,
             ast::Instruction::MatchAny { .. } => None,
+            ast::Instruction::Isspacep { .. } => None,
             ast::Instruction::Sub(ast::ArithDetails::Signed(_), _) => None,
             ast::Instruction::Sub(ast::ArithDetails::Unsigned(_), _) => None,
             ast::Instruction::Add(ast::ArithDetails::Signed(_), _) => None,
@@ -7999,7 +8019,7 @@ fn texture_geometry_to_vec_length(geometry: ast::TextureGeometry) -> u8 {
     }
 }
 
-impl<T: ArgParams> ast::Arg4Tex<T> {
+impl<T: ArgParams> ast::Arg5Tex<T> {
     fn map<U: ArgParams, V: ArgumentMapVisitor<T, U>>(
         self,
         visitor: &mut V,
@@ -8007,7 +8027,7 @@ impl<T: ArgParams> ast::Arg4Tex<T> {
         geometry: ast::TextureGeometry,
         value_type: ast::Type,
         coordinate_type: ast::ScalarType,
-    ) -> Result<ast::Arg4Tex<U>, TranslateError> {
+    ) -> Result<ast::Arg5Tex<U>, TranslateError> {
         let dst = visitor.operand(
             ArgumentDescriptor {
                 op: self.dst,
@@ -8054,11 +8074,27 @@ impl<T: ArgParams> ast::Arg4Tex<T> {
             &ast::Type::Vector(coordinate_type, coord_length),
             ast::StateSpace::Reg,
         )?;
-        Ok(ast::Arg4Tex {
+        let lod = self
+            .lod
+            .map(|lod| {
+                visitor.operand(
+                    ArgumentDescriptor {
+                        op: lod,
+                        is_dst: false,
+                        is_memory_access: false,
+                        non_default_implicit_conversion: None,
+                    },
+                    &ast::Type::Scalar(coordinate_type),
+                    ast::StateSpace::Reg,
+                )
+            })
+            .transpose()?;
+        Ok(ast::Arg5Tex {
             dst,
             image,
             layer,
             coordinates,
+            lod,
         })
     }
 }
@@ -8069,6 +8105,14 @@ impl<T: ArgParams> ast::Arg4Sust<T> {
     fn map<U: ArgParams, V: ArgumentMapVisitor<T, U>>(
         self,
         visitor: &mut V,
         details: &ast::SurfaceDetails,
     ) -> Result<ast::Arg4Sust<U>, TranslateError> {
+        let (type_, space) = if details.direct {
+            (ast::Type::Surfref, ast::StateSpace::Global)
+        } else {
+            (
+                ast::Type::Scalar(ast::ScalarType::B64),
+                ast::StateSpace::Reg,
+            )
+        };
         let image = visitor.operand(
             ArgumentDescriptor {
                 op: self.image,
@@ -8076,8 +8120,8 @@ impl<T: ArgParams> ast::Arg4Sust<T> {
                 is_memory_access: false,
                 non_default_implicit_conversion: None,
             },
-            &ast::Type::Surfref,
-            ast::StateSpace::Global,
+            &type_,
+            space,
         )?;
         let layer = self
             .layer
diff --git a/zluda/src/cuda.rs b/zluda/src/cuda.rs
index 1f37dbfe..f8a05848 100644
--- a/zluda/src/cuda.rs
+++ b/zluda/src/cuda.rs
@@ -214,6 +214,9 @@ cuda_function_declarations!(
         cuLinkComplete,
         cuLinkDestroy,
         cuLinkCreate_v2,
+        cuMipmappedArrayCreate,
+        cuMipmappedArrayDestroy,
+        cuMipmappedArrayGetLevel
     ]
 );
@@ -1242,9 +1245,7 @@ mod definitions {
         surface::create(pSurfObject, pResDesc)
     }
 
-    pub(crate) unsafe fn cuSurfObjectDestroy(
-        surfObject: hipSurfaceObject_t,
-    ) -> hipError_t {
+    pub(crate) unsafe fn cuSurfObjectDestroy(surfObject: hipSurfaceObject_t) -> hipError_t {
         surface::destroy(surfObject)
     }
@@ -1253,7 +1254,7 @@ mod definitions {
         pResDesc: *const CUDA_RESOURCE_DESC,
         pTexDesc: *const HIP_TEXTURE_DESC,
         pResViewDesc: *const HIP_RESOURCE_VIEW_DESC,
-    ) -> hipError_t {
+    ) -> Result<(), CUresult> {
         texobj::create(pTexObject, pResDesc, pTexDesc, pResViewDesc)
     }
@@ -1652,4 +1653,26 @@ mod definitions {
     ) -> Result<(), CUresult> {
         link::create(numOptions, options, optionValues, stateOut)
     }
+
+    pub(crate) unsafe fn cuMipmappedArrayCreate(
+        pHandle: *mut CUmipmappedArray,
+        pMipmappedArrayDesc: *const HIP_ARRAY3D_DESCRIPTOR,
+        numMipmapLevels: ::std::os::raw::c_uint,
+    ) -> Result<(), CUresult> {
+        array::mipmapped_create(pHandle, pMipmappedArrayDesc, numMipmapLevels)
+    }
+
+    pub(crate) unsafe fn cuMipmappedArrayDestroy(
+        hMipmappedArray: CUmipmappedArray,
+    ) -> hipError_t {
+        array::mipmapped_destroy(hMipmappedArray)
+    }
+
+    pub(crate) unsafe fn cuMipmappedArrayGetLevel(
+        pLevelArray: *mut CUarray,
+        hMipmappedArray: CUmipmappedArray,
+        level: ::std::os::raw::c_uint,
+    ) -> Result<(), CUresult> {
+        array::mipmapped_get_level(pLevelArray, hMipmappedArray, level)
+    }
 }
diff --git a/zluda/src/impl/array.rs b/zluda/src/impl/array.rs
index ab2db78f..4acbb7d1 100644
--- a/zluda/src/impl/array.rs
+++ b/zluda/src/impl/array.rs
@@ -47,12 +47,13 @@ pub(crate) unsafe fn get_descriptor_3d(
     flags |= CUDA_ARRAY3D_SURFACE_LDST;
     let array = hipfix::array::get(array);
     if let (Some(array), Some(array_descriptor)) = (array.as_ref(), array_descriptor.as_mut()) {
+        let real_format = hipfix::get_broken_format(array).unwrap_or(array.Format);
         *array_descriptor = CUDA_ARRAY3D_DESCRIPTOR {
             Width: array.width as usize,
             Height: array.height as usize,
             Depth: array.depth as usize,
             NumChannels: array.NumChannels,
-            Format: mem::transmute(array.Format), // compatible
+            Format: mem::transmute(real_format), // compatible
             Flags: flags,
         };
         hipError_t::hipSuccess
@@ -81,3 +82,65 @@ pub(crate) unsafe fn create(
         Err(CUresult::CUDA_ERROR_INVALID_VALUE)
     }
 }
+
+pub(crate) unsafe fn mipmapped_create(
+    mipmapped_array: *mut CUmipmappedArray,
+    mipmapped_array_desc: *const HIP_ARRAY3D_DESCRIPTOR,
+    num_mipmap_levels: u32,
+) -> Result<(), CUresult> {
+    if let Some(mipmapped_array_desc) = (mipmapped_array_desc).as_ref() {
+        let mut mipmapped_array_desc = *mipmapped_array_desc;
+        let (hack_flag, format) = hipfix::get_non_broken_format(mipmapped_array_desc.Format);
+        mipmapped_array_desc.Format = format;
+        let mut hip_array = ptr::null_mut();
+        hip_call_cuda!(hipMipmappedArrayCreate(
+            &mut hip_array,
+            &mut mipmapped_array_desc,
+            num_mipmap_levels
+        ));
+        if (hip_array as usize & 0b11) != 0 {
+            hip_call_cuda!(hipMipmappedArrayDestroy(hip_array));
+            return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
+        }
+        hip_array = (hip_array as usize | hack_flag as usize) as _;
+        *mipmapped_array = hip_array.cast();
+        Ok(())
+    } else {
+        Err(CUresult::CUDA_ERROR_INVALID_VALUE)
+    }
+}
+
+pub(crate) unsafe fn mipmapped_destroy(mipmapped_array: CUmipmappedArray) -> hipError_t {
+    let mipmapped_array = hipfix::array::get_mipmapped(mipmapped_array).0;
+    hipMipmappedArrayDestroy(mipmapped_array)
+}
+
+pub(crate) unsafe fn mipmapped_get_level(
+    level_array: *mut CUarray,
+    mipmapped_array: CUmipmappedArray,
+    level: u32,
+) -> Result<(), CUresult> {
+    let (mipmapped_array, hack_flag) = hipfix::array::get_mipmapped(mipmapped_array);
+    if let Some(mipmapped_array) = mipmapped_array.as_mut() {
+        let mut hip_array = mem::zeroed();
+        hip_call_cuda!(hipMipmappedArrayGetLevel(
+            &mut hip_array,
+            mipmapped_array as *mut _,
+            level
+        ));
+        let hip_array_mut = hip_array.as_mut().ok_or(CUresult::CUDA_ERROR_UNKNOWN)?;
+        hip_array_mut.textureType = hack_flag;
+        if mipmapped_array.height == 0 {
+            // HIP returns 1 here for no good reason
+            hip_array_mut.height = 0;
+        }
+        if mipmapped_array.depth == 0 {
+            // HIP returns 1 here for no good reason
+            hip_array_mut.depth = 0;
+        }
+        *level_array = mem::transmute(hip_array);
+        Ok(())
+    } else {
+        Err(CUresult::CUDA_ERROR_INVALID_VALUE)
+    }
+}
diff --git a/zluda/src/impl/hipfix.rs b/zluda/src/impl/hipfix.rs
index 77fec003..3257d97d 100644
--- a/zluda/src/impl/hipfix.rs
+++ b/zluda/src/impl/hipfix.rs
@@ -3,6 +3,8 @@ use cuda_types::*;
 use hip_runtime_sys::*;
 use std::{env, ptr};
 
+use self::array::get_mipmapped;
+
 use super::{function::FunctionData, stream, LiveCheck};
 
 // For some reason HIP does not tolerate hipArraySurfaceLoadStore, even though
@@ -26,8 +28,24 @@ pub(crate) fn get_non_broken_format(format: hipArray_Format) -> (u32, hipArray_Format) {
 }
 
 #[must_use]
-pub(crate) fn get_broken_format(broken: u32, format: hipArray_Format) -> hipArray_Format {
-    match (broken, format) {
+pub(crate) fn get_broken_format(array: &hipArray) -> Option<hipArray_Format> {
+    get_broken_format_impl(array.textureType, array.Format)
+}
+
+#[must_use]
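+// Splits the handle back into the real hipMipmappedArray plus the format the
+// caller originally asked for, when the low-bit hack flag says the stored
+// UNSIGNED_INT8/16 format actually stands in for SIGNED_INT8/16 or HALF.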
+pub(crate) unsafe fn get_broken_format_mipmapped(
+    array: CUmipmappedArray,
+) -> Result<(&'static hipMipmappedArray, Option<hipArray_Format>), CUresult> {
+    let (hip_array, flag) = get_mipmapped(array);
+    let hip_array_ref = hip_array
+        .as_ref()
+        .ok_or(CUresult::CUDA_ERROR_INVALID_VALUE)?;
+    let format_override = get_broken_format_impl(flag, hip_array_ref.format);
+    Ok((hip_array_ref, format_override))
+}
+
+fn get_broken_format_impl(hack_flag: u32, format: hipArray_Format) -> Option<hipArray_Format> {
+    Some(match (hack_flag, format) {
         (2, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16) => hipArray_Format::HIP_AD_FORMAT_HALF,
         (1, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16) => {
             hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16
@@ -35,13 +53,14 @@ pub(crate) fn get_broken_format(broken: u32, format: hipArray_Format) -> hipArray_Format {
         (1, hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8) => {
             hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8
         }
-        (_, f) => f,
-    }
+        (_, _) => return None,
+    })
 }
 
 // memcpy3d fails when copying array1d arrays, so we mark all layered arrays by
 // setting the LSB
 pub(crate) mod array {
+    use super::{get_broken_format, get_broken_format_mipmapped};
     use crate::{
         hip_call_cuda,
         r#impl::{memcpy3d_from_cuda, memory_type_from_cuda, FromCuda},
@@ -51,23 +70,171 @@ pub(crate) mod array {
     use std::{mem, ptr};
 
     pub(crate) unsafe fn with_resource_desc<T>(
-        cuda: *const CUDA_RESOURCE_DESC,
-        fn_: impl FnOnce(*const HIP_RESOURCE_DESC) -> T,
-    ) -> T {
-        let cuda = &*cuda;
+        res_desc: *const CUDA_RESOURCE_DESC,
+        res_desc_view: *const HIP_RESOURCE_VIEW_DESC,
+        fn_: impl FnOnce(*const HIP_RESOURCE_DESC, *const HIP_RESOURCE_VIEW_DESC) -> T,
+    ) -> Result<T, CUresult> {
+        let cuda = &*res_desc;
         if cuda.resType == CUresourcetype::CU_RESOURCE_TYPE_ARRAY {
             let mut cuda = *cuda;
-            cuda.res.array.hArray = mem::transmute(get(cuda.res.array.hArray));
-            fn_((&cuda as *const CUDA_RESOURCE_DESC).cast::<HIP_RESOURCE_DESC>())
+            let hip_array = get(cuda.res.array.hArray);
+            cuda.res.array.hArray = mem::transmute(hip_array);
+            if let Some(hip_array) = hip_array.as_ref() {
+                if let Some(new_format) = get_broken_format(hip_array) {
+                    return if res_desc_view == ptr::null() {
+                        let res_desc_view = HIP_RESOURCE_VIEW_DESC {
+                            format: resource_view_format(new_format, hip_array.NumChannels)?,
+                            width: hip_array.width as usize,
+                            height: hip_array.height as usize,
+                            depth: hip_array.depth as usize,
+                            firstMipmapLevel: 0,
+                            lastMipmapLevel: 0,
+                            firstLayer: 0,
+                            lastLayer: 0,
+                            reserved: mem::zeroed(),
+                        };
+                        Ok(fn_(
+                            (&cuda as *const CUDA_RESOURCE_DESC).cast::<HIP_RESOURCE_DESC>(),
+                            &res_desc_view,
+                        ))
+                    } else {
+                        Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+                    };
+                }
+            }
+            Ok(fn_(
+                (&cuda as *const CUDA_RESOURCE_DESC).cast::<HIP_RESOURCE_DESC>(),
+                res_desc_view,
+            ))
+        } else if cuda.resType == CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY {
+            let (hip_mipmapped_array, format_override) =
+                get_broken_format_mipmapped(cuda.res.mipmap.hMipmappedArray)?;
+            let mut cuda = *cuda;
+            cuda.res.mipmap.hMipmappedArray = mem::transmute(hip_mipmapped_array as *const _);
+            if let Some(new_format) = format_override {
+                return if res_desc_view == ptr::null() {
+                    let res_desc_view = HIP_RESOURCE_VIEW_DESC {
+                        format: resource_view_format(new_format, hip_mipmapped_array.num_channels)?,
+                        width: hip_mipmapped_array.width as usize,
+                        height: hip_mipmapped_array.height as usize,
+                        depth: hip_mipmapped_array.depth as usize,
+                        firstMipmapLevel: hip_mipmapped_array.min_mipmap_level,
+                        lastMipmapLevel: hip_mipmapped_array.max_mipmap_level,
+                        firstLayer: 0,
+                        lastLayer: 0,
+                        reserved: mem::zeroed(),
+                    };
+                    Ok(fn_(
+                        (&cuda as *const CUDA_RESOURCE_DESC).cast::<HIP_RESOURCE_DESC>(),
+                        &res_desc_view,
+                    ))
+                } else {
+                    Err(CUresult::CUDA_ERROR_NOT_SUPPORTED)
+                };
+            }
+            Ok(fn_(
+                (&cuda as *const CUDA_RESOURCE_DESC).cast::<HIP_RESOURCE_DESC>(),
+                res_desc_view,
+            ))
         } else {
-            fn_((cuda as *const CUDA_RESOURCE_DESC).cast::<HIP_RESOURCE_DESC>())
+            Ok(fn_(
+                (cuda as *const CUDA_RESOURCE_DESC).cast::<HIP_RESOURCE_DESC>(),
+                res_desc_view,
+            ))
         }
     }
 
+    fn resource_view_format(
+        format: hipArray_Format,
+        num_channels: u32,
+    ) -> Result<HIPresourceViewFormat, CUresult> {
+        Ok(match (format, num_channels) {
+            (hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8, 1) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_UINT_1X8
+            }
+            (hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8, 2) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_UINT_2X8
+            }
+            (hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8, 4) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_UINT_4X8
+            }
+            (hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8, 1) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_SINT_1X8
+            }
+            (hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8, 2) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_SINT_2X8
+            }
+            (hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8, 4) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_SINT_4X8
+            }
+            (hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16, 1) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_UINT_1X16
+            }
+            (hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16, 2) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_UINT_2X16
+            }
+            (hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16, 4) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_UINT_4X16
+            }
+            (hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16, 1) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_SINT_1X16
+            }
+            (hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16, 2) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_SINT_2X16
+            }
+            (hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16, 4) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_SINT_4X16
+            }
+            (hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT32, 1) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_UINT_1X32
+            }
+            (hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT32, 2) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_UINT_2X32
+            }
+            (hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT32, 4) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_UINT_4X32
+            }
+            (hipArray_Format::HIP_AD_FORMAT_SIGNED_INT32, 1) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_SINT_1X32
+            }
+            (hipArray_Format::HIP_AD_FORMAT_SIGNED_INT32, 2) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_SINT_2X32
+            }
+            (hipArray_Format::HIP_AD_FORMAT_SIGNED_INT32, 4) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_SINT_4X32
+            }
+            (hipArray_Format::HIP_AD_FORMAT_HALF, 1) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_FLOAT_1X16
+            }
+            (hipArray_Format::HIP_AD_FORMAT_HALF, 2) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_FLOAT_2X16
+            }
+            (hipArray_Format::HIP_AD_FORMAT_HALF, 4) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_FLOAT_4X16
+            }
+            (hipArray_Format::HIP_AD_FORMAT_FLOAT, 1) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_FLOAT_1X32
+            }
+            (hipArray_Format::HIP_AD_FORMAT_FLOAT, 2) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_FLOAT_2X32
+            }
+            (hipArray_Format::HIP_AD_FORMAT_FLOAT, 4) => {
+                HIPresourceViewFormat::HIP_RES_VIEW_FORMAT_FLOAT_4X32
+            }
+            _ => return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED),
+        })
+    }
+
     pub(crate) fn get(cuda: CUarray) -> hipArray_t {
         (cuda as usize & !3usize) as hipArray_t
     }
 
+    pub(crate) fn get_mipmapped(cuda: CUmipmappedArray) -> (hipMipmappedArray_t, u32) {
+        let array = (cuda as usize & !3usize) as hipMipmappedArray_t;
+        let broken_flag = (cuda as usize & 3usize) as u32;
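+        // hipMipmappedArrayCreate hands out 4-byte-aligned pointers (enforced
+        // in array::mipmapped_create), so the two low bits are free to carry
+        // the format hack flag; e.g. a tagged handle 0x7f001000 | 2 decodes
+        // back to the array pointer 0x7f001000 with hack_flag == 2
+        // (addresses purely illustrative).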
+        (array, broken_flag)
+    }
+
     pub(crate) fn to_cuda(array: hipArray_t, layered_dims: usize) -> CUarray {
         let a1d_layered = layered_dims as usize;
         ((array as usize) | a1d_layered) as CUarray
diff --git a/zluda/src/impl/mod.rs b/zluda/src/impl/mod.rs
index 34566af5..73c6efa5 100644
--- a/zluda/src/impl/mod.rs
+++ b/zluda/src/impl/mod.rs
@@ -220,6 +220,7 @@ impl FromCuda for CUlibraryOption {}
 impl FromCuda for CUDA_KERNEL_NODE_PARAMS_v1 {}
 impl FromCuda for CUjitInputType {}
 impl FromCuda for CUDA_RESOURCE_DESC {}
+impl FromCuda for CUmipmappedArray {}
 
 impl FromCuda for *mut context::Context {}
 impl FromCuda for *mut stream::Stream {}
diff --git a/zluda/src/impl/texobj.rs b/zluda/src/impl/texobj.rs
index 21eb4530..0096c747 100644
--- a/zluda/src/impl/texobj.rs
+++ b/zluda/src/impl/texobj.rs
@@ -1,19 +1,29 @@
+use super::hipfix;
+use crate::hip_call_cuda;
 use cuda_types::*;
 use hip_runtime_sys::*;
 use std::ptr;
 
-use super::hipfix;
-
 pub(crate) unsafe fn create(
     p_tex_object: *mut hipTextureObject_t,
     p_res_desc: *const CUDA_RESOURCE_DESC,
     p_tex_desc: *const HIP_TEXTURE_DESC,
     p_res_view_desc: *const HIP_RESOURCE_VIEW_DESC,
-) -> hipError_t {
+) -> Result<(), CUresult> {
     if p_res_desc == ptr::null() {
-        return hipError_t::hipErrorInvalidValue;
+        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
     }
-    hipfix::array::with_resource_desc(p_res_desc, |p_res_desc| {
-        hipTexObjectCreate(p_tex_object, p_res_desc, p_tex_desc, p_res_view_desc)
-    })
+    hipfix::array::with_resource_desc(
+        p_res_desc,
+        p_res_view_desc,
+        |p_res_desc, p_res_view_desc| {
+            hip_call_cuda!(hipTexObjectCreate(
+                p_tex_object,
+                p_res_desc,
+                p_tex_desc,
+                p_res_view_desc
+            ));
+            Ok(())
+        },
+    )?
 }
diff --git a/zluda/src/impl/texref.rs b/zluda/src/impl/texref.rs
index 307b5bad..b72de09b 100644
--- a/zluda/src/impl/texref.rs
+++ b/zluda/src/impl/texref.rs
@@ -94,7 +94,7 @@ pub(crate) unsafe fn set_array(
     if let Some(array) = array.as_ref() {
         hip_call_cuda!(hipTexRefSetFormat(
             texref,
-            hipfix::get_broken_format(array.textureType, array.Format),
+            hipfix::get_broken_format(array).unwrap_or(array.Format),
             array.NumChannels as i32,
         ));
         hip_call_cuda!(hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT));
diff --git a/zluda/tests/kernel_suld.rs b/zluda/tests/kernel_suld.rs
index 07fc5606..7d368a26 100644
--- a/zluda/tests/kernel_suld.rs
+++ b/zluda/tests/kernel_suld.rs
@@ -377,7 +377,10 @@ unsafe fn kernel_suld_impl<
     let x = random_size.sample(&mut rng) * sizeof_pixel;
     let y = random_size.sample(&mut rng);
     let z = random_size.sample(&mut rng);
-    let values = [rng.gen::<SustType>(); SULD_N];
+    let mut values = [SustType::default(); SULD_N];
+    for value in values.iter_mut() {
+        *value = rng.gen::<SustType>();
+    }
     let converted_values = force_transmute(values, BYTE_FILLER3);
     *host_side_data.get_unchecked_mut(geo.address(size, x, y, z, sizeof_pixel)) = converted_values;
     assert_eq!(
diff --git a/zluda/tests/kernel_sust.rs b/zluda/tests/kernel_sust.rs
index 5057b563..e6a07def 100644
--- a/zluda/tests/kernel_sust.rs
+++ b/zluda/tests/kernel_sust.rs
@@ -419,7 +419,10 @@ unsafe fn kernel_sust_impl<
     let x = random_size.sample(&mut rng) * sizeof_pixel;
     let y = random_size.sample(&mut rng);
     let z = random_size.sample(&mut rng);
-    let values = [rng.gen::<SustType>(); SUST_N];
+    let mut values = [SustType::default(); SUST_N];
+    for value in values.iter_mut() {
+        *value = rng.gen::<SustType>();
+    }
     let mut args = vec![
         &x as *const _ as *const c_void,
         &y as *const _ as *const _,
diff --git a/zluda/tests/kernel_tex.rs b/zluda/tests/kernel_tex.rs
index 6b2d1d30..88e3c4ba 100644
--- a/zluda/tests/kernel_tex.rs
+++ b/zluda/tests/kernel_tex.rs
@@ -213,6 +213,7 @@ generate_tests!(
     CU_AD_FORMAT_SIGNED_INT8,
     CU_AD_FORMAT_SIGNED_INT16,
     CU_AD_FORMAT_SIGNED_INT32,
+    // TODO: update half crate
     //CU_AD_FORMAT_HALF,
     CU_AD_FORMAT_FLOAT
 ],
@@ -337,13 +338,13 @@ const BYTE_FILLER2: u8 = 0xfe;
 
 unsafe fn force_transmute<From: Any, To>(f: From) -> To {
     if mem::size_of::<From>() == mem::size_of::<To>()
-        && mem::size_of::() == mem::size_of::()
+        && mem::size_of::() == mem::size_of::()
     {
         return mem::transmute_copy(&f);
     }
-    if mem::size_of::() == mem::size_of::() {
+    if mem::size_of::() == mem::size_of::() {
         if let Some(value) = <dyn Any>::downcast_ref::<f16>(&f) {
-            return mem::transmute_copy(&((value.to_f64() / f16::MAX.to_f64()) as f32));
+            return mem::transmute_copy(&value.to_f32());
         }
         if let Some(value) = <dyn Any>::downcast_ref::<u8>(&f) {
             return mem::transmute_copy(&((*value as f64 / u8::MAX as f64) as f32));
@@ -359,6 +360,9 @@ unsafe fn force_transmute<From: Any, To>(f: From) -> To {
         }
     }
     if mem::size_of::() == mem::size_of::() {
+        if let Some(_) = <dyn Any>::downcast_ref::<f16>(&f) {
+            return mem::transmute_copy(&f);
+        }
         if let Some(value) = <dyn Any>::downcast_ref::<u8>(&f) {
             return mem::transmute_copy(&f16::from_f64(*value as f64 / u8::MAX as f64));
         }
diff --git a/zluda/tests/linking.rs b/zluda/tests/linking.rs
index 025d8bab..57ada55f 100644
--- a/zluda/tests/linking.rs
+++ b/zluda/tests/linking.rs
@@ -229,16 +229,6 @@ impl Directive {
             Directive::Shared => unimplemented!(),
         }
     }
-
-    fn assert_exact(self) -> bool {
-        match self {
-            Directive::Kernel => false,
-            Directive::Method => true,
-            Directive::Global => false,
-            Directive::Const => false,
-            Directive::Shared => unimplemented!(),
-        }
-    }
 }
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
@@ -370,45 +360,6 @@ fn create_kernel(linking: Linking, directive: Directive, defined: bool) -> String {
     kernel
 }
 
-fn assert_compatible(
-    results: Vec<(Linking, Directive, bool, i32, Option<i32>)>,
-    expected: [(Linking, Directive, bool, i32, Option<i32>); 50],
-) {
-    if results.len() != expected.len() {
-        panic!();
-    }
-    let mut broken = Vec::new();
-    for (result, expected) in results.into_iter().zip(IntoIterator::into_iter(expected)) {
-        let (linking, directive, defined, build_result, load_result) = result;
-        let (_, _, _, expected_build, expected_load) = expected;
-        if expected_build == 0 {
-            if build_result != 0 {
-                broken.push((
-                    linking,
-                    directive,
-                    defined,
-                    (build_result, load_result),
-                    (expected_build, expected_load),
-                ));
-                continue;
-            }
-            if expected_load == Some(0) {
-                if load_result != Some(0) {
-                    broken.push((
-                        linking,
-                        directive,
-                        defined,
-                        (build_result, load_result),
-                        (expected_build, expected_load),
-                    ));
-                    continue;
-                }
-            }
-        }
-    }
-    assert_eq!(broken, []);
-}
-
 fn assert_compatible_compile(
     compiled: &[T],
     compiled_expected: &[T],
@@ -1107,3 +1058,28 @@ unsafe fn emit_weak_fn<T: CudaDriverFns>(cuda: T) {
         CUresult::CUDA_SUCCESS
     );
 }
+
+
+cuda_driver_test!(static_entry_decl);
+
+unsafe fn static_entry_decl<T: CudaDriverFns>(cuda: T) {
+    let input1 = "
+        .version 6.5
+        .target sm_35
+        .address_size 64
+
+        .entry foobar();
+        .entry foobar() { ret; }\0"
+        .to_string();
+    assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+    let mut ctx = ptr::null_mut();
+    assert_eq!(
+        cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut module = mem::zeroed();
+    assert_eq!(
+        cuda.cuModuleLoadData(&mut module, input1.as_ptr().cast()),
+        CUresult::CUDA_SUCCESS
+    );
+}
diff --git a/zluda/tests/mipmap_array.ptx b/zluda/tests/mipmap_array.ptx
new file mode 100644
index 00000000..9dbfd2b8
--- /dev/null
+++ b/zluda/tests/mipmap_array.ptx
@@ -0,0 +1,47 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.entry texture_to_surface(
+    .param .u64 texture_param,
+    .param .u64 surface_param
+)
+{
+    .reg .u64 texture;
+    .reg .u64 surface;
+    .reg .f32 f<5>;
+    .reg .b16 rs<5>;
+
+    ld.param.u64 texture, [texture_param];
+    ld.param.u64 surface, [surface_param];
+
+    tex.2d.v4.f32.f32 {f1, f2, f3, f4}, [texture, {0f00000000, 0f00000000}];
+    { cvt.rn.f16.f32 rs1, f1;}
+    { cvt.rn.f16.f32 rs2, f2;}
+    { cvt.rn.f16.f32 rs3, f3;}
+    { cvt.rn.f16.f32 rs4, f4;}
+    sust.b.2d.v4.b16.trap [surface, {0, 0}], {rs1, rs2, rs3, rs4};
+    ret;
+}
+
+
+.entry read_tex_2d_mip(
+    .param .u64 texture_param,
+    .param .u64 output_param
+)
+{
+    .reg .u64 texture;
+    .reg .u64 output;
+    .reg .f32 f<5>;
+
+    ld.param.u64 texture, [texture_param];
+    ld.param.u64 output, [output_param];
+
+    // 3F800000 = 1.0
+    tex.level.2d.v4.f32.f32 {f1, f2, f3, f4}, [texture, {0f00000000, 0f00000000}], 0f3f800000;
+    st.global.f32 [output], f1;
+    st.global.f32 [output+4], f2;
+    st.global.f32 [output+8], f3;
+    st.global.f32 [output+12], f4;
+    ret;
+}
diff --git a/zluda/tests/mipmap_array.rs b/zluda/tests/mipmap_array.rs
new file mode 100644
index 00000000..998f81e7
--- /dev/null
+++ b/zluda/tests/mipmap_array.rs
@@ -0,0 +1,431 @@
+use crate::common::CudaDriverFns;
+use cuda_types::*;
+use half::f16;
+use std::{ffi::c_void, mem, ptr};
+
+mod common;
+
+// TODO: These two tests expose various random brokenness of mipmapped array
+// and texture objects. This should be turned into extensive tests like
+// kernel_sust/kernel_suld/kernel_tex
+
+cuda_driver_test!(mipmap_texture_to_surface);
+
+unsafe fn mipmap_texture_to_surface<T: CudaDriverFns>(cuda: T) {
+    let kernel = include_str!("mipmap_array.ptx");
+    let mut kernel = kernel.to_owned();
+    kernel.push('\0');
+    assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+    let mut ctx = ptr::null_mut();
+    assert_eq!(
+        cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut module = ptr::null_mut();
+    assert_eq!(
+        cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut mipmap_array = ptr::null_mut();
+    let mipmap_desc = CUDA_ARRAY3D_DESCRIPTOR {
+        Width: 1368,
+        Height: 770,
+        Depth: 0,
+        Format: CUarray_format::CU_AD_FORMAT_HALF,
+        NumChannels: 4,
+        Flags: 0,
+    };
+    assert_eq!(
+        cuda.cuMipmappedArrayCreate(&mut mipmap_array, &mipmap_desc, 8),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut array_0 = mem::zeroed();
+    let mut array_1 = mem::zeroed();
+    assert_eq!(
+        cuda.cuMipmappedArrayGetLevel(&mut array_0, mipmap_array, 0),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut queried_descriptor = mem::zeroed();
+    assert_eq!(
+        cuda.cuArray3DGetDescriptor_v2(&mut queried_descriptor, array_0),
+        CUresult::CUDA_SUCCESS
+    );
+    assert_eq!(mipmap_desc.Depth, queried_descriptor.Depth);
+    assert_eq!(
+        cuda.cuMipmappedArrayGetLevel(&mut array_1, mipmap_array, 1),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut pixels = [0x3C66u16, 0x4066, 0x4299, 4466];
+    let memcpy_from_host = CUDA_MEMCPY2D {
+        srcXInBytes: 0,
+        srcY: 0,
+        srcMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
+        srcHost: pixels.as_mut_ptr() as _,
+        srcDevice: CUdeviceptr_v2(ptr::null_mut()),
+        srcArray: ptr::null_mut(),
+        srcPitch: 4 * mem::size_of::<u16>(),
+        dstXInBytes: 0,
+        dstY: 0,
+        dstMemoryType: CUmemorytype::CU_MEMORYTYPE_ARRAY,
+        dstHost: ptr::null_mut(),
+        dstDevice: CUdeviceptr_v2(ptr::null_mut()),
+        dstArray: array_0,
+        dstPitch: 0,
+        WidthInBytes: 4 * mem::size_of::<u16>(),
+        Height: 1,
+    };
+    assert_eq!(
        cuda.cuMemcpy2DUnaligned_v2(&memcpy_from_host),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut texture = mem::zeroed();
+    let texture_resource_desc = CUDA_RESOURCE_DESC {
+        resType: CUresourcetype::CU_RESOURCE_TYPE_ARRAY,
+        res: CUDA_RESOURCE_DESC_st__bindgen_ty_1 {
+            array: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1 { hArray: array_0 },
+        },
+        flags: 0,
+    };
+    let texture_desc = CUDA_TEXTURE_DESC {
+        addressMode: [
+            CUaddress_mode::CU_TR_ADDRESS_MODE_CLAMP,
+            CUaddress_mode::CU_TR_ADDRESS_MODE_CLAMP,
+            CUaddress_mode::CU_TR_ADDRESS_MODE_CLAMP,
+        ],
+        filterMode: CUfilter_mode::CU_TR_FILTER_MODE_LINEAR,
+        flags: 2,
+        maxAnisotropy: 0,
+        mipmapFilterMode: CUfilter_mode::CU_TR_FILTER_MODE_POINT,
+        mipmapLevelBias: 0f32,
+        minMipmapLevelClamp: 0f32,
+        maxMipmapLevelClamp: 0f32,
+        borderColor: [0f32, 0f32, 0f32, 0f32],
+        reserved: mem::zeroed(),
+    };
+    assert_eq!(
+        cuda.cuTexObjectCreate(
+            &mut texture,
+            &texture_resource_desc,
+            &texture_desc,
+            ptr::null()
+        ),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut surface = mem::zeroed();
+    let surface_resource_desc = CUDA_RESOURCE_DESC {
+        resType: CUresourcetype::CU_RESOURCE_TYPE_ARRAY,
+        res: CUDA_RESOURCE_DESC_st__bindgen_ty_1 {
+            array: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1 { hArray: array_1 },
+        },
+        flags: 0,
+    };
+    assert_eq!(
+        cuda.cuSurfObjectCreate(&mut surface, &surface_resource_desc),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut texture_to_surface = mem::zeroed();
+    assert_eq!(
+        cuda.cuModuleGetFunction(
+            &mut texture_to_surface,
+            module,
+            b"texture_to_surface\0".as_ptr().cast()
+        ),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut params = [&mut texture, &mut surface];
+    assert_eq!(
+        cuda.cuLaunchKernel(
+            texture_to_surface,
+            1,
+            1,
+            1,
+            1,
+            1,
+            1,
+            0,
+            ptr::null_mut(),
+            params.as_mut_ptr().cast(),
+            ptr::null_mut(),
+        ),
+        CUresult::CUDA_SUCCESS
+    );
+    assert_eq!(
+        cuda.cuStreamSynchronize(ptr::null_mut()),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut memcpy_dst = [u16::MAX; 4];
+    let memcpy_to_host = CUDA_MEMCPY2D {
+        srcXInBytes: 0,
+        srcY: 0,
+        srcMemoryType: CUmemorytype::CU_MEMORYTYPE_ARRAY,
+        srcHost: ptr::null(),
+        srcDevice: CUdeviceptr_v2(ptr::null_mut()),
+        srcArray: array_1,
+        srcPitch: 0,
+        dstXInBytes: 0,
+        dstY: 0,
+        dstMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
+        dstHost: memcpy_dst.as_mut_ptr() as _,
+        dstDevice: CUdeviceptr_v2(ptr::null_mut()),
+        dstArray: ptr::null_mut(),
+        dstPitch: 4 * mem::size_of::<u16>(),
+        WidthInBytes: 4 * mem::size_of::<u16>(),
+        Height: 1,
+    };
+    assert_eq!(
+        cuda.cuMemcpy2DUnaligned_v2(&memcpy_to_host),
+        CUresult::CUDA_SUCCESS
+    );
+    assert_eq!(&pixels, &memcpy_dst);
+    let texture_resource_desc = CUDA_RESOURCE_DESC {
+        resType: CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY,
+        res: CUDA_RESOURCE_DESC_st__bindgen_ty_1 {
+            mipmap: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_2 {
+                hMipmappedArray: mipmap_array,
+            },
+        },
+        flags: 0,
+    };
+    let texture_desc = CUDA_TEXTURE_DESC {
+        addressMode: [
+            CUaddress_mode::CU_TR_ADDRESS_MODE_CLAMP,
+            CUaddress_mode::CU_TR_ADDRESS_MODE_CLAMP,
+            CUaddress_mode::CU_TR_ADDRESS_MODE_CLAMP,
+        ],
+        filterMode: CUfilter_mode::CU_TR_FILTER_MODE_LINEAR,
+        flags: 2,
+        maxAnisotropy: 0,
+        mipmapFilterMode: CUfilter_mode::CU_TR_FILTER_MODE_LINEAR,
+        mipmapLevelBias: 0f32,
+        minMipmapLevelClamp: 0f32,
+        maxMipmapLevelClamp: 7f32,
+        borderColor: [0f32, 0f32, 0f32, 0f32],
+        reserved: mem::zeroed(),
+    };
+    let mut mipmapped_tex_obj = mem::zeroed();
+    assert_eq!(
+        cuda.cuTexObjectCreate(
+            &mut mipmapped_tex_obj,
+            &texture_resource_desc,
+            &texture_desc,
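+            // the null resource view desc is deliberate: with_resource_desc
+            // synthesizes one spanning all mip levels when the array carries a
+            // format override (as this CU_AD_FORMAT_HALF array does)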
            ptr::null()
+        ),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut read_tex_2d_mip = mem::zeroed();
+    assert_eq!(
+        cuda.cuModuleGetFunction(
+            &mut read_tex_2d_mip,
+            module,
+            b"read_tex_2d_mip\0".as_ptr().cast()
+        ),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut output_buffer = mem::zeroed();
+    assert_eq!(
+        cuda.cuMemAlloc_v2(&mut output_buffer, 4 * mem::size_of::<f32>()),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut params = [
+        &mut mipmapped_tex_obj as *mut _ as *mut c_void,
+        &mut output_buffer as *mut _ as *mut c_void,
+    ];
+    assert_eq!(
+        cuda.cuLaunchKernel(
+            read_tex_2d_mip,
+            1,
+            1,
+            1,
+            1,
+            1,
+            1,
+            0,
+            ptr::null_mut(),
+            params.as_mut_ptr().cast(),
+            ptr::null_mut(),
+        ),
+        CUresult::CUDA_SUCCESS
+    );
+    assert_eq!(
+        cuda.cuStreamSynchronize(ptr::null_mut()),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut output = [f32::MAX; 4];
+    assert_eq!(
+        cuda.cuMemcpyDtoH_v2(
+            output.as_mut_ptr().cast(),
+            output_buffer,
+            4 * mem::size_of::<f32>()
+        ),
+        CUresult::CUDA_SUCCESS
+    );
+    let pixels_f32 = pixels
+        .iter()
+        .copied()
+        .map(|x| mem::transmute::<_, f16>(x).to_f32())
+        .collect::<Vec<f32>>();
+    assert_eq!(&output[..], &*pixels_f32);
+}
+
+cuda_driver_test!(mipmap_texture_to_surface2);
+
+unsafe fn mipmap_texture_to_surface2<T: CudaDriverFns>(cuda: T) {
+    let kernel = include_str!("mipmap_array.ptx");
+    let mut kernel = kernel.to_owned();
+    kernel.push('\0');
+    assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS);
+    let mut ctx = ptr::null_mut();
+    assert_eq!(
+        cuda.cuCtxCreate_v2(&mut ctx, 0, CUdevice_v1(0)),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut module = ptr::null_mut();
+    assert_eq!(
+        cuda.cuModuleLoadData(&mut module, kernel.as_ptr() as _),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut array_0 = mem::zeroed();
+    let mipmap_desc = CUDA_ARRAY3D_DESCRIPTOR {
+        Width: 1368,
+        Height: 770,
+        Depth: 0,
+        Format: CUarray_format::CU_AD_FORMAT_HALF,
+        NumChannels: 4,
+        Flags: 2,
+    };
+    assert_eq!(
+        cuda.cuArray3DCreate_v2(&mut array_0, &mipmap_desc),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut array_1 = mem::zeroed();
+    let mipmap_desc = CUDA_ARRAY3D_DESCRIPTOR {
+        Width: 1368 / 2,
+        Height: 770 / 2,
+        Depth: 0,
+        Format: CUarray_format::CU_AD_FORMAT_HALF,
+        NumChannels: 4,
+        Flags: 2,
+    };
+    assert_eq!(
+        cuda.cuArray3DCreate_v2(&mut array_1, &mipmap_desc),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut pixels = [0x3C66u16, 0x4066, 0x4299, 4466];
+    let memcpy_from_host = CUDA_MEMCPY2D {
+        srcXInBytes: 0,
+        srcY: 0,
+        srcMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
+        srcHost: pixels.as_mut_ptr() as _,
+        srcDevice: CUdeviceptr_v2(ptr::null_mut()),
+        srcArray: ptr::null_mut(),
+        srcPitch: 4 * mem::size_of::<u16>(),
+        dstXInBytes: 0,
+        dstY: 0,
+        dstMemoryType: CUmemorytype::CU_MEMORYTYPE_ARRAY,
+        dstHost: ptr::null_mut(),
+        dstDevice: CUdeviceptr_v2(ptr::null_mut()),
+        dstArray: array_0,
+        dstPitch: 0,
+        WidthInBytes: 4 * mem::size_of::<u16>(),
+        Height: 1,
+    };
+    assert_eq!(
+        cuda.cuMemcpy2DUnaligned_v2(&memcpy_from_host),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut func = mem::zeroed();
+    assert_eq!(
+        cuda.cuModuleGetFunction(&mut func, module, b"texture_to_surface\0".as_ptr().cast()),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut texture = mem::zeroed();
+    let texture_resource_desc = CUDA_RESOURCE_DESC {
+        resType: CUresourcetype::CU_RESOURCE_TYPE_ARRAY,
+        res: CUDA_RESOURCE_DESC_st__bindgen_ty_1 {
+            array: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1 { hArray: array_0 },
+        },
+        flags: 0,
+    };
+    let texture_desc = CUDA_TEXTURE_DESC {
+        addressMode: [
+            CUaddress_mode::CU_TR_ADDRESS_MODE_CLAMP,
+            CUaddress_mode::CU_TR_ADDRESS_MODE_CLAMP,
            CUaddress_mode::CU_TR_ADDRESS_MODE_CLAMP,
+        ],
+        filterMode: CUfilter_mode::CU_TR_FILTER_MODE_LINEAR,
+        flags: 2,
+        maxAnisotropy: 0,
+        mipmapFilterMode: CUfilter_mode::CU_TR_FILTER_MODE_POINT,
+        mipmapLevelBias: 0f32,
+        minMipmapLevelClamp: 0f32,
+        maxMipmapLevelClamp: 0f32,
+        borderColor: [0f32, 0f32, 0f32, 0f32],
+        reserved: mem::zeroed(),
+    };
+    assert_eq!(
+        cuda.cuTexObjectCreate(
+            &mut texture,
+            &texture_resource_desc,
+            &texture_desc,
+            ptr::null()
+        ),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut surface = mem::zeroed();
+    let surface_resource_desc = CUDA_RESOURCE_DESC {
+        resType: CUresourcetype::CU_RESOURCE_TYPE_ARRAY,
+        res: CUDA_RESOURCE_DESC_st__bindgen_ty_1 {
+            array: CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1 { hArray: array_1 },
+        },
+        flags: 0,
+    };
+    assert_eq!(
+        cuda.cuSurfObjectCreate(&mut surface, &surface_resource_desc),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut params = [&mut texture, &mut surface];
+    assert_eq!(
+        cuda.cuLaunchKernel(
+            func,
+            1,
+            1,
+            1,
+            1,
+            1,
+            1,
+            0,
+            ptr::null_mut(),
+            params.as_mut_ptr().cast(),
+            ptr::null_mut(),
+        ),
+        CUresult::CUDA_SUCCESS
+    );
+    assert_eq!(
+        cuda.cuStreamSynchronize(ptr::null_mut()),
+        CUresult::CUDA_SUCCESS
+    );
+    let mut memcpy_dst = [u16::MAX; 4];
+    let memcpy_to_host = CUDA_MEMCPY2D {
+        srcXInBytes: 0,
+        srcY: 0,
+        srcMemoryType: CUmemorytype::CU_MEMORYTYPE_ARRAY,
+        srcHost: ptr::null(),
+        srcDevice: CUdeviceptr_v2(ptr::null_mut()),
+        srcArray: array_1,
+        srcPitch: 0,
+        dstXInBytes: 0,
+        dstY: 0,
+        dstMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
+        dstHost: memcpy_dst.as_mut_ptr() as _,
+        dstDevice: CUdeviceptr_v2(ptr::null_mut()),
+        dstArray: ptr::null_mut(),
+        dstPitch: 4 * mem::size_of::<u16>(),
+        WidthInBytes: 4 * mem::size_of::<u16>(),
+        Height: 1,
+    };
+    assert_eq!(
+        cuda.cuMemcpy2DUnaligned_v2(&memcpy_to_host),
+        CUresult::CUDA_SUCCESS
+    );
+    assert_eq!(&pixels, &memcpy_dst);
+}
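+
+// The half-precision constants above decode to roughly 1.0996 (0x3C66),
+// 2.1992 (0x4066) and 3.2988 (0x4299). The final entry is written as decimal
+// 4466, i.e. bit pattern 0x1172 (~0.000665), possibly intended as 0x4466;
+// either way both tests only require the u16 bit patterns to round-trip
+// through the texture/surface path unchanged.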