diff --git a/README.md b/README.md index 52927d0b..5be6a8ad 100644 --- a/README.md +++ b/README.md @@ -215,10 +215,6 @@ Performance is currently much lower than the native HIP backend, see the discuss This is a ROCm/HIP bug. Currently, CompuBench tests have to be run one at a time. -- Some tests output black screen. - - This is due to a bug (or an unintended hardware feature) in CompuBench that just happens to work on NVIDIA GPUs. - #### V-Ray Benchmark - Currently, ZLUDA crashes when running V-Ray benchmark. Nonetheless, certain "lucky" older combinations of ZLUDA and ROCm/HIP are known to run V-Ray Benchmark successfully. diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc index 48ea22b8..1edcbd5c 100644 Binary files a/ptx/lib/zluda_ptx_impl.bc and b/ptx/lib/zluda_ptx_impl.bc differ diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp index 420ce651..ecbe691c 100644 --- a/ptx/lib/zluda_ptx_impl.cpp +++ b/ptx/lib/zluda_ptx_impl.cpp @@ -11,6 +11,7 @@ // https://llvm.org/docs/AMDGPUUsage.html #include +#include #include #define HIP_NO_HALF #include @@ -155,6 +156,399 @@ static __device__ float4::Native_vec_ __pack_to_float4(const T &t) return result; } +typedef uint32_t uint8 __attribute__((ext_vector_type(8))); +typedef uint32_t zluda_uint3 __attribute__((ext_vector_type(3))); +typedef uint8 CONSTANT_SPACE *surface_ptr; + +template +static __device__ To transmute(From f) +{ + if constexpr (sizeof(To) == sizeof(From)) + { + return std::bit_cast(f); + } + else if constexpr (sizeof(To) > sizeof(From)) + { + union + { + To t; + From f; + } u = {To{0}}; + u.f = f; + return u.t; + } + else if constexpr (sizeof(To) < sizeof(From)) + { + union + { + From f; + To t; + } u = {From{f}}; + return u.t; + } + else + { + static_assert(sizeof(To) == 0); + } +} + +enum class ImageGeometry +{ + _1D, + _2D, + _3D, + A1D, + A2D +}; + +// clang-format off +template struct Coordinates; +template <> struct Coordinates { using type = uint1::Native_vec_; }; +template <> struct Coordinates { using type = uint2::Native_vec_; }; +template <> struct Coordinates { using type = uint4::Native_vec_; }; +template <> struct Coordinates +{ + using type = uint2::Native_vec_; using arg_type = uint1::Native_vec_; + static __device__ type pack_layer(uint32_t layer, arg_type coord) + { + return type { coord.x, layer }; + } +}; +template <> struct Coordinates +{ + using type = zluda_uint3; using arg_type = uint2::Native_vec_; + static __device__ type pack_layer(uint32_t layer, arg_type coord) + { + return type { coord.x, coord.y, layer }; + } +}; +// clang-format on + +template +static __device__ void image_store_pck(T value, typename Coordinates::type coord, surface_ptr surface) +{ + if constexpr (sizeof(T) <= sizeof(uint)) + { + uint value_dword = transmute(value); + if constexpr (geo == ImageGeometry::_1D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:1D unorm" : : "v"(value_dword), "v"(coord.x), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_2D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:2D unorm" : : "v"(value_dword), "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_3D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:3D unorm" : : "v"(value_dword), "v"(transmute(coord)), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A1D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:1D_ARRAY unorm" : : "v"(value_dword), "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A2D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:2D_ARRAY unorm" : : "v"(value_dword), "v"(coord), "s"(*surface) : "memory"); + } + else + { + static_assert(sizeof(T) == 0, "Invalid geometry"); + } + } + else if constexpr (sizeof(T) == sizeof(uint2::Native_vec_)) + { + uint2::Native_vec_ value_dword2 = transmute(value); + if constexpr (geo == ImageGeometry::_1D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:1D unorm" : : "v"(value_dword2), "v"(coord.x), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_2D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:2D unorm" : : "v"(value_dword2), "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_3D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:3D unorm" : : "v"(value_dword2), "v"(transmute(coord)), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A1D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:1D_ARRAY unorm" : : "v"(value_dword2), "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A2D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:2D_ARRAY unorm" : : "v"(value_dword2), "v"(coord), "s"(*surface) : "memory"); + } + else + { + static_assert(sizeof(T) == 0, "Invalid geometry"); + } + } + else if constexpr (sizeof(T) == sizeof(uint4::Native_vec_)) + { + uint4::Native_vec_ value_dword4 = transmute(value); + if constexpr (geo == ImageGeometry::_1D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:1D unorm" : : "v"(value_dword4), "v"(coord.x), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_2D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:2D unorm" : : "v"(value_dword4), "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_3D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:3D unorm" : : "v"(value_dword4), "v"(transmute(coord)), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A1D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:1D_ARRAY unorm" : : "v"(value_dword4), "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A2D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:2D_ARRAY unorm" : : "v"(value_dword4), "v"(coord), "s"(*surface) : "memory"); + } + else + { + static_assert(sizeof(T) == 0, "Invalid geometry"); + } + } + else + { + static_assert(sizeof(T) == 0, "Invalid vector size"); + } +} + +template +static __device__ T image_load_pck(typename Coordinates::type coord, surface_ptr surface) +{ + if constexpr (sizeof(T) <= sizeof(uint)) + { + uint data; + if constexpr (geo == ImageGeometry::_1D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_2D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_3D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:3D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(transmute(coord)), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A1D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A2D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else + { + static_assert(sizeof(ImageGeometry) == 0, "Invalid geometry"); + } + return transmute(data); + } + else if constexpr (sizeof(T) == sizeof(uint2::Native_vec_)) + { + uint2::Native_vec_ data; + if constexpr (geo == ImageGeometry::_1D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_2D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_3D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:3D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(transmute(coord)), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A1D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A2D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else + { + static_assert(sizeof(ImageGeometry) == 0, "Invalid geometry"); + } + return transmute(data); + } + else if constexpr (sizeof(T) == sizeof(uint4::Native_vec_)) + { + uint4::Native_vec_ data; + if constexpr (geo == ImageGeometry::_1D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_2D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_3D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:3D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(transmute(coord)), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A1D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A2D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else + { + static_assert(sizeof(ImageGeometry) == 0, "Invalid geometry"); + } + return transmute(data); + } + else + { + static_assert(sizeof(T) == 0, "Invalid vector size"); + } +} + +template +static __device__ uint4::Native_vec_ image_load_pck_full(typename Coordinates::type coord, surface_ptr surface) +{ + uint4::Native_vec_ data; + if constexpr (geo == ImageGeometry::_1D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_2D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_3D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:3D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(transmute(coord)), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A1D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A2D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else + { + static_assert(sizeof(ImageGeometry) == 0, "Invalid geometry"); + } + return data; +} + +template +static __device__ void image_store_pck_full_with(uint4::Native_vec_ data, T value, typename Coordinates::type coord, surface_ptr surface) +{ + // We avoid unions for types smaller than sizeof(uint32_t), + // because in those cases we get this garbage: + // ds_write_b128 v2, v[5:8] + // ds_write_b16 v2, v9 + // ds_read_b128 v[5:8], v2 + // tested with ROCm 5.7.1 on gfx1030 + if constexpr (sizeof(T) == sizeof(uint8_t)) + { + uint32_t x = uint32_t(std::bit_cast(value)); + uint32_t data_0 = ((data[0]) >> 8) << 8; + data[0] = data_0 | x; + } + else if constexpr (sizeof(T) == sizeof(uint16_t)) + { + uint32_t x = uint32_t(std::bit_cast(value)); + uint32_t data_0 = ((data[0]) >> 16) << 16; + data[0] = data_0 | x; + } + else + { + union + { + uint4::Native_vec_ full_vec; + T value; + } u = {0}; + u.full_vec = data; + u.value = value; + data = u.full_vec; + } + image_store_pck(data, coord, surface); +} + +constexpr auto IMAGE_RESERVED_TOP_BITS = 3; + +static __device__ surface_ptr get_surface_pointer(uint64_t s) +{ + return (surface_ptr)((s << IMAGE_RESERVED_TOP_BITS) >> IMAGE_RESERVED_TOP_BITS); +} + +static __device__ surface_ptr get_surface_pointer(struct textureReference GLOBAL_SPACE *surf_ref) +{ + return (surface_ptr)(surf_ref->textureObject); +} + +static __device__ uint32_t x_coordinate_shift(uint64_t s) +{ + return uint32_t(s >> (64 - IMAGE_RESERVED_TOP_BITS)); +} + +static __device__ uint32_t x_coordinate_shift(struct textureReference GLOBAL_SPACE *ptr) +{ + uint32_t channels = uint32_t(ptr->numChannels); + uint32_t format_width = 0; + hipArray_Format format = ptr->format; + switch (format) + { + case hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8: + case hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8: + format_width = 1; + break; + case hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16: + case hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16: + case hipArray_Format::HIP_AD_FORMAT_HALF: + format_width = 2; + break; + case hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT32: + case hipArray_Format::HIP_AD_FORMAT_SIGNED_INT32: + case hipArray_Format::HIP_AD_FORMAT_FLOAT: + format_width = 4; + break; + default: + __builtin_unreachable(); + } + return uint32_t(__builtin_ctz(format_width * channels)); +} + +template +static __device__ T suld_b_zero(Surface surf_arg, typename Coordinates::type coord) +{ + surface_ptr surface = get_surface_pointer(surf_arg); + uint32_t shift_x = x_coordinate_shift(surf_arg); + coord.x = coord.x >> shift_x; + return image_load_pck(coord, surface); +} + +template +static __device__ void sust_b_zero(Surface surf_arg, typename Coordinates::type coord, T data) +{ + surface_ptr surface = get_surface_pointer(surf_arg); + uint32_t shift_x = x_coordinate_shift(surf_arg); + coord.x = coord.x >> shift_x; + if (shift_x <= __builtin_ctz(sizeof(T))) [[likely]] + { + image_store_pck(data, coord, surface); + } + else + { + uint4::Native_vec_ pixel = image_load_pck_full(coord, surface); + image_store_pck_full_with(pixel, data, coord, surface); + } +} + extern "C" { #define atomic_inc(NAME, SUCCESS, FAILURE, SCOPE, SPACE) \ @@ -620,179 +1014,101 @@ extern "C" suld_b_a2d_vec(_v4, b32, uint4); // suld_b_a2d_vec(_v4, b64, ulong4); -#define sust_b_1d_vec(VEC, TYPE, HIP_TYPE) \ - void FUNC(sust_b_1d##VEC##_##TYPE##_trap)(struct textureReference GLOBAL_SPACE * ptr, int1::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \ - { \ - hipTextureObject_t textureObject = ptr->textureObject; \ - TEXTURE_OBJECT_PARAMETERS_INIT; \ - (void)s; \ - int byte_coord = __hipGetPixelAddr(coord.x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i)); \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - auto tmp = __pack_to_float4(hip_data); \ - __ockl_image_store_1D(i, byte_coord, tmp); \ - } \ - void FUNC(sust_b_indirect_1d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int1::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \ - { \ - hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - surf1Dwrite(hip_data, surfObj, coord.x); \ - } - - sust_b_1d_vec(, b8, uchar1); - sust_b_1d_vec(, b16, ushort1); - sust_b_1d_vec(, b32, uint1); - // sust_b_1d_vec(, b64, ulong1); - sust_b_1d_vec(_v2, b8, uchar2); - sust_b_1d_vec(_v2, b16, ushort2); - sust_b_1d_vec(_v2, b32, uint2); - // sust_b_1d_vec(_v2, b64, ulong2); - sust_b_1d_vec(_v4, b8, uchar4); - sust_b_1d_vec(_v4, b16, ushort4); - sust_b_1d_vec(_v4, b32, uint4); - // sust_b_1d_vec(_v4, b64, ulong4); - -#define sust_b_2d_vec(VEC, TYPE, HIP_TYPE) \ - void FUNC(sust_b_2d##VEC##_##TYPE##_trap)(struct textureReference GLOBAL_SPACE * ptr, int2::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \ - { \ - hipTextureObject_t textureObject = ptr->textureObject; \ - TEXTURE_OBJECT_PARAMETERS_INIT; \ - (void)s; \ - int byte_coord = __hipGetPixelAddr(coord.x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i)); \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - auto tmp = __pack_to_float4(hip_data); \ - __ockl_image_store_2D(i, int2(byte_coord, coord.y).data, tmp); \ - } \ - void FUNC(sust_b_indirect_2d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int2::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \ - { \ - hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - surf2Dwrite(hip_data, surfObj, coord.x, coord.y); \ - } - - sust_b_2d_vec(, b8, uchar1); - sust_b_2d_vec(, b16, ushort1); - sust_b_2d_vec(, b32, uint1); - // sust_b_2d_vec(, b64, ulong1); - sust_b_2d_vec(_v2, b8, uchar2); - sust_b_2d_vec(_v2, b16, ushort2); - sust_b_2d_vec(_v2, b32, uint2); - // sust_b_2d_vec(_v2, b64, ulong2); - sust_b_2d_vec(_v4, b8, uchar4); - sust_b_2d_vec(_v4, b16, ushort4); - sust_b_2d_vec(_v4, b32, uint4); - // sust_b_2d_vec(_v4, b64, ulong4); - -#define sust_b_3d_vec(VEC, TYPE, HIP_TYPE) \ - void FUNC(sust_b_3d##VEC##_##TYPE##_trap)(struct textureReference GLOBAL_SPACE * ptr, int4::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \ - { \ - hipTextureObject_t textureObject = ptr->textureObject; \ - TEXTURE_OBJECT_PARAMETERS_INIT; \ - (void)s; \ - int byte_coord = __hipGetPixelAddr(coord.x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i)); \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - auto tmp = __pack_to_float4(hip_data); \ - __ockl_image_store_3D(i, int4(byte_coord, coord.y, coord.z, 0).data, tmp); \ - } \ - void FUNC(sust_b_indirect_3d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int4::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \ - { \ - hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \ - __HIP_SURFACE_OBJECT_PARAMETERS_INIT; \ - int byte_coord = __hipGetPixelAddr(coord.x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i)); \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - auto tmp = __pack_to_float4(hip_data); \ - __ockl_image_store_3D(i, int4(byte_coord, coord.y, coord.z, 0).data, tmp); \ - } - - sust_b_3d_vec(, b8, uchar1); - sust_b_3d_vec(, b16, ushort1); - sust_b_3d_vec(, b32, uint1); - // sust_b_3d_vec(, b64, ulong1); - sust_b_3d_vec(_v2, b8, uchar2); - sust_b_3d_vec(_v2, b16, ushort2); - sust_b_3d_vec(_v2, b32, uint2); - // sust_b_3d_vec(_v2, b64, ulong2); - sust_b_3d_vec(_v4, b8, uchar4); - sust_b_3d_vec(_v4, b16, ushort4); - sust_b_3d_vec(_v4, b32, uint4); - // sust_b_3d_vec(_v4, b64, ulong4); - -#define sust_b_a1d_vec(VEC, TYPE, HIP_TYPE) \ - void FUNC(sust_b_a1d##VEC##_##TYPE##_trap)(struct textureReference GLOBAL_SPACE * ptr, uint layer, int x, HIP_TYPE::Native_vec_ data) \ - { \ - hipTextureObject_t textureObject = ptr->textureObject; \ - TEXTURE_OBJECT_PARAMETERS_INIT; \ - (void)s; \ - int byte_coord = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1Da(i), __ockl_image_channel_order_1Da(i)); \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - auto tmp = __pack_to_float4(hip_data); \ - __ockl_image_store_1Da(i, int2(byte_coord, int(layer)).data, tmp); \ - } \ - void FUNC(sust_b_indirect_a1d##VEC##_##TYPE##_trap)(uint64_t serf_arg, uint layer, int x, HIP_TYPE::Native_vec_ data) \ - { \ - hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \ - __HIP_SURFACE_OBJECT_PARAMETERS_INIT; \ - int byte_coord = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1Da(i), __ockl_image_channel_order_1Da(i)); \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - auto tmp = __pack_to_float4(hip_data); \ - __ockl_image_store_1Da(i, int2(byte_coord, int(layer)).data, tmp); \ - } - - sust_b_a1d_vec(, b8, uchar1); - sust_b_a1d_vec(, b16, ushort1); - sust_b_a1d_vec(, b32, uint1); - // sust_b_a1d_vec(, b64, ulong1); - sust_b_a1d_vec(_v2, b8, uchar2); - sust_b_a1d_vec(_v2, b16, ushort2); - sust_b_a1d_vec(_v2, b32, uint2); - // sust_b_a1d_vec(_v2, b64, ulong2); - sust_b_a1d_vec(_v4, b8, uchar4); - sust_b_a1d_vec(_v4, b16, ushort4); - sust_b_a1d_vec(_v4, b32, uint4); - // sust_b_a1d_vec(_v4, b64, ulong4); - -#define sust_b_a2d_vec(VEC, TYPE, HIP_TYPE) \ - void FUNC(sust_b_a2d##VEC##_##TYPE##_trap)(struct textureReference GLOBAL_SPACE * ptr, uint layer, int x, int y, HIP_TYPE::Native_vec_ data) \ - { \ - hipTextureObject_t textureObject = ptr->textureObject; \ - TEXTURE_OBJECT_PARAMETERS_INIT; \ - (void)s; \ - int byte_coord = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2Da(i), __ockl_image_channel_order_2Da(i)); \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - auto tmp = __pack_to_float4(hip_data); \ - __ockl_image_store_2Da(i, int4(byte_coord, y, int(layer), 0).data, tmp); \ - } \ - void FUNC(sust_b_indirect_a2d##VEC##_##TYPE##_trap)(uint64_t serf_arg, uint layer, int x, int y, HIP_TYPE::Native_vec_ data) \ - { \ - hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \ - __HIP_SURFACE_OBJECT_PARAMETERS_INIT; \ - int byte_coord = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2Da(i), __ockl_image_channel_order_2Da(i)); \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - auto tmp = __pack_to_float4(hip_data); \ - __ockl_image_store_2Da(i, int4(byte_coord, y, int(layer), 0).data, tmp); \ - } - - sust_b_a2d_vec(, b8, uchar1); - sust_b_a2d_vec(, b16, ushort1); - sust_b_a2d_vec(, b32, uint1); - // sust_b_a2d_vec(, b64, ulong1); - sust_b_a2d_vec(_v2, b8, uchar2); - sust_b_a2d_vec(_v2, b16, ushort2); - sust_b_a2d_vec(_v2, b32, uint2); - // sust_b_a2d_vec(_v2, b64, ulong2); - sust_b_a2d_vec(_v4, b8, uchar4); - sust_b_a2d_vec(_v4, b16, ushort4); - sust_b_a2d_vec(_v4, b32, uint4); - // sust_b_a2d_vec(_v4, b64, ulong4); +#define SUST_B_ZERO(TYPE, GEOMETRY, HIP_TYPE) \ + HIP_TYPE::Native_vec_ FUNC(suld_b_indirect_##TYPE##_zero)(uint64_t surf_arg, typename Coordinates::type coord) \ + { \ + return suld_b_zero(surf_arg, coord); \ + } \ + void FUNC(sust_b_indirect_##TYPE##_zero)(uint64_t surf_arg, typename Coordinates::type coord, HIP_TYPE::Native_vec_ data) \ + { \ + sust_b_zero(surf_arg, coord, data); \ + } \ + HIP_TYPE::Native_vec_ FUNC(suld_b_##TYPE##_zero)(struct textureReference GLOBAL_SPACE * ptr, typename Coordinates::type coord) \ + { \ + return suld_b_zero(ptr, coord); \ + } \ + void FUNC(sust_b_##TYPE##_zero)(struct textureReference GLOBAL_SPACE * ptr, typename Coordinates::type coord, HIP_TYPE::Native_vec_ data) \ + { \ + sust_b_zero(ptr, coord, data); \ + } + +#define SUST_B_ZERO_ARRAY(TYPE, GEOMETRY, HIP_TYPE) \ + HIP_TYPE::Native_vec_ FUNC(suld_b_indirect_##TYPE##_zero)(uint64_t surf_arg, uint32_t layer, typename Coordinates::arg_type coord) \ + { \ + auto coord_array = Coordinates::pack_layer(layer, coord); \ + return suld_b_zero(surf_arg, coord_array); \ + } \ + void FUNC(sust_b_indirect_##TYPE##_zero)(uint64_t surf_arg, uint32_t layer, typename Coordinates::arg_type coord, HIP_TYPE::Native_vec_ data) \ + { \ + auto coord_array = Coordinates::pack_layer(layer, coord); \ + sust_b_zero(surf_arg, coord_array, data); \ + } \ + HIP_TYPE::Native_vec_ FUNC(suld_b_##TYPE##_zero)(struct textureReference GLOBAL_SPACE * ptr, uint32_t layer, typename Coordinates::arg_type coord) \ + { \ + auto coord_array = Coordinates::pack_layer(layer, coord); \ + return suld_b_zero(ptr, coord_array); \ + } \ + void FUNC(sust_b_##TYPE##_zero)(struct textureReference GLOBAL_SPACE * ptr, uint32_t layer, typename Coordinates::arg_type coord, HIP_TYPE::Native_vec_ data) \ + { \ + auto coord_array = Coordinates::pack_layer(layer, coord); \ + sust_b_zero(ptr, coord_array, data); \ + } + + SUST_B_ZERO(1d_b8, ImageGeometry::_1D, uchar1); + SUST_B_ZERO(1d_b16, ImageGeometry::_1D, ushort1); + SUST_B_ZERO(1d_b32, ImageGeometry::_1D, uint1); + SUST_B_ZERO(1d_b64, ImageGeometry::_1D, ulong1); + SUST_B_ZERO(1d_v2_b8, ImageGeometry::_1D, uchar2); + SUST_B_ZERO(1d_v2_b16, ImageGeometry::_1D, ushort2); + SUST_B_ZERO(1d_v2_b32, ImageGeometry::_1D, uint2); + SUST_B_ZERO(1d_v2_b64, ImageGeometry::_1D, ulong2); + SUST_B_ZERO(1d_v4_b8, ImageGeometry::_1D, uchar4); + SUST_B_ZERO(1d_v4_b16, ImageGeometry::_1D, ushort4); + SUST_B_ZERO(1d_v4_b32, ImageGeometry::_1D, uint4); + SUST_B_ZERO(2d_b8, ImageGeometry::_2D, uchar1); + SUST_B_ZERO(2d_b16, ImageGeometry::_2D, ushort1); + SUST_B_ZERO(2d_b32, ImageGeometry::_2D, uint1); + SUST_B_ZERO(2d_b64, ImageGeometry::_2D, ulong1); + SUST_B_ZERO(2d_v2_b8, ImageGeometry::_2D, uchar2); + SUST_B_ZERO(2d_v2_b16, ImageGeometry::_2D, ushort2); + SUST_B_ZERO(2d_v2_b32, ImageGeometry::_2D, uint2); + SUST_B_ZERO(2d_v2_b64, ImageGeometry::_2D, ulong2); + SUST_B_ZERO(2d_v4_b8, ImageGeometry::_2D, uchar4); + SUST_B_ZERO(2d_v4_b16, ImageGeometry::_2D, ushort4); + SUST_B_ZERO(2d_v4_b32, ImageGeometry::_2D, uint4); + SUST_B_ZERO(3d_b8, ImageGeometry::_3D, uchar1); + SUST_B_ZERO(3d_b16, ImageGeometry::_3D, ushort1); + SUST_B_ZERO(3d_b32, ImageGeometry::_3D, uint1); + SUST_B_ZERO(3d_b64, ImageGeometry::_3D, ulong1); + SUST_B_ZERO(3d_v2_b8, ImageGeometry::_3D, uchar2); + SUST_B_ZERO(3d_v2_b16, ImageGeometry::_3D, ushort2); + SUST_B_ZERO(3d_v2_b32, ImageGeometry::_3D, uint2); + SUST_B_ZERO(3d_v2_b64, ImageGeometry::_3D, ulong2); + SUST_B_ZERO(3d_v4_b8, ImageGeometry::_3D, uchar4); + SUST_B_ZERO(3d_v4_b16, ImageGeometry::_3D, ushort4); + SUST_B_ZERO(3d_v4_b32, ImageGeometry::_3D, uint4); + SUST_B_ZERO_ARRAY(a1d_b8, ImageGeometry::A1D, uchar1); + SUST_B_ZERO_ARRAY(a1d_b16, ImageGeometry::A1D, ushort1); + SUST_B_ZERO_ARRAY(a1d_b32, ImageGeometry::A1D, uint1); + SUST_B_ZERO_ARRAY(a1d_b64, ImageGeometry::A1D, ulong1); + SUST_B_ZERO_ARRAY(a1d_v2_b8, ImageGeometry::A1D, uchar2); + SUST_B_ZERO_ARRAY(a1d_v2_b16, ImageGeometry::A1D, ushort2); + SUST_B_ZERO_ARRAY(a1d_v2_b32, ImageGeometry::A1D, uint2); + SUST_B_ZERO_ARRAY(a1d_v2_b64, ImageGeometry::A1D, ulong2); + SUST_B_ZERO_ARRAY(a1d_v4_b8, ImageGeometry::A1D, uchar4); + SUST_B_ZERO_ARRAY(a1d_v4_b16, ImageGeometry::A1D, ushort4); + SUST_B_ZERO_ARRAY(a1d_v4_b32, ImageGeometry::A1D, uint4); + SUST_B_ZERO_ARRAY(a2d_b8, ImageGeometry::A2D, uchar1); + SUST_B_ZERO_ARRAY(a2d_b16, ImageGeometry::A2D, ushort1); + SUST_B_ZERO_ARRAY(a2d_b32, ImageGeometry::A2D, uint1); + SUST_B_ZERO_ARRAY(a2d_b64, ImageGeometry::A2D, ulong1); + SUST_B_ZERO_ARRAY(a2d_v2_b8, ImageGeometry::A2D, uchar2); + SUST_B_ZERO_ARRAY(a2d_v2_b16, ImageGeometry::A2D, ushort2); + SUST_B_ZERO_ARRAY(a2d_v2_b32, ImageGeometry::A2D, uint2); + SUST_B_ZERO_ARRAY(a2d_v2_b64, ImageGeometry::A2D, ulong2); + SUST_B_ZERO_ARRAY(a2d_v4_b8, ImageGeometry::A2D, uchar4); + SUST_B_ZERO_ARRAY(a2d_v4_b16, ImageGeometry::A2D, ushort4); + SUST_B_ZERO_ARRAY(a2d_v4_b32, ImageGeometry::A2D, uint4); __device__ static inline bool is_upper_warp() { diff --git a/ptx/src/translate.rs b/ptx/src/translate.rs index 61a74c95..10852583 100644 --- a/ptx/src/translate.rs +++ b/ptx/src/translate.rs @@ -2934,7 +2934,7 @@ fn replace_instructions_with_builtins_impl<'input>( vector, "_", suld.type_.to_ptx_name(), - "_trap", + "_zero", ] .concat(); statements.push(instruction_to_fn_call( @@ -2955,7 +2955,7 @@ fn replace_instructions_with_builtins_impl<'input>( vector, "_", sust.type_.to_ptx_name(), - "_trap", + "_zero", ] .concat(); statements.push(instruction_to_fn_call( diff --git a/zluda/src/cuda.rs b/zluda/src/cuda.rs index 1d054c38..1f37dbfe 100644 --- a/zluda/src/cuda.rs +++ b/zluda/src/cuda.rs @@ -1245,7 +1245,7 @@ mod definitions { pub(crate) unsafe fn cuSurfObjectDestroy( surfObject: hipSurfaceObject_t, ) -> hipError_t { - hipDestroySurfaceObject(surfObject) + surface::destroy(surfObject) } pub(crate) unsafe fn cuTexObjectCreate( diff --git a/zluda/src/impl/surface.rs b/zluda/src/impl/surface.rs index fcf9a52c..0f24fa38 100644 --- a/zluda/src/impl/surface.rs +++ b/zluda/src/impl/surface.rs @@ -1,23 +1,65 @@ +use super::hipfix; +use crate::hip_call_cuda; use cuda_types::*; use hip_runtime_sys::*; use std::{mem, ptr}; -use crate::hip_call_cuda; - -use super::{hipfix, FromCuda}; +// Same as in zluda_ptx_impl.cpp +const IMAGE_RESERVED_TOP_BITS: u32 = 3; pub(crate) unsafe fn create( - p_surf_object: *mut hipSurfaceObject_t, + result: *mut hipSurfaceObject_t, p_res_desc: *const CUDA_RESOURCE_DESC, ) -> Result<(), CUresult> { if p_res_desc == ptr::null() { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } let desc = to_surface_desc(*p_res_desc)?; - hip_call_cuda!(hipCreateSurfaceObject(p_surf_object, &desc)); + // We need to check array format and channel count to set top bits of the surface object. + // HIP does not support non-Array sources anyway + if desc.resType != hipResourceType::hipResourceTypeArray { + return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED); + } + let mut surf_obj = mem::zeroed(); + hip_call_cuda!(hipCreateSurfaceObject(&mut surf_obj, &desc)); + let top_reserved_bits = surf_obj as usize >> (usize::BITS - IMAGE_RESERVED_TOP_BITS); + if top_reserved_bits != 0 { + #[allow(unused_must_use)] + { + hipDestroySurfaceObject(surf_obj); + } + return Err(CUresult::CUDA_ERROR_UNKNOWN); + } + let format_size = format_size((&*desc.res.array.array).Format)?; + let channels = (&*desc.res.array.array).NumChannels; + let pixel_size = format_size * channels as usize; + let shift_amount = + (pixel_size.trailing_zeros() as usize) << (usize::BITS - IMAGE_RESERVED_TOP_BITS); + surf_obj = (surf_obj as usize | shift_amount) as _; + *result = surf_obj; Ok(()) } +pub(crate) unsafe fn destroy(surf_object: hipSurfaceObject_t) -> hipError_t { + hipDestroySurfaceObject( + (((surf_object as usize) << IMAGE_RESERVED_TOP_BITS) >> IMAGE_RESERVED_TOP_BITS) as _, + ) +} + +pub(crate) fn format_size(f: hipArray_Format) -> Result { + Ok(match f { + hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8 + | hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8 => 1, + hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16 + | hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16 + | hipArray_Format::HIP_AD_FORMAT_HALF => 2, + hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT32 + | hipArray_Format::HIP_AD_FORMAT_SIGNED_INT32 + | hipArray_Format::HIP_AD_FORMAT_FLOAT => 4, + _ => return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED), + }) +} + unsafe fn to_surface_desc(res_desc: CUDA_RESOURCE_DESC) -> Result { let res_type = mem::transmute(res_desc.resType); let res: hipResourceDesc__bindgen_ty_1 = match res_desc.resType { @@ -26,92 +68,10 @@ unsafe fn to_surface_desc(res_desc: CUDA_RESOURCE_DESC) -> Result hipResourceDesc__bindgen_ty_1 { - mipmap: hipResourceDesc__bindgen_ty_1__bindgen_ty_2 { - mipmap: mem::transmute(res_desc.res.mipmap.hMipmappedArray), - }, - }, - CUresourcetype::CU_RESOURCE_TYPE_LINEAR => hipResourceDesc__bindgen_ty_1 { - linear: hipResourceDesc__bindgen_ty_1__bindgen_ty_3 { - devPtr: res_desc.res.linear.devPtr.0, - desc: channel_format_desc( - FromCuda::from_cuda(res_desc.res.linear.format), - res_desc.res.linear.numChannels, - )?, - sizeInBytes: res_desc.res.linear.sizeInBytes, - }, - }, - CUresourcetype::CU_RESOURCE_TYPE_PITCH2D => hipResourceDesc__bindgen_ty_1 { - pitch2D: hipResourceDesc__bindgen_ty_1__bindgen_ty_4 { - devPtr: res_desc.res.pitch2D.devPtr.0, - desc: channel_format_desc( - FromCuda::from_cuda(res_desc.res.pitch2D.format), - res_desc.res.pitch2D.numChannels, - )?, - width: res_desc.res.pitch2D.width, - height: res_desc.res.pitch2D.height, - pitchInBytes: res_desc.res.pitch2D.pitchInBytes, - }, - }, - _ => todo!(), + _ => return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED), }; Ok(hipResourceDesc { resType: res_type, res, }) } - -fn channel_format_desc( - format: hipArray_Format, - num_channels: u32, -) -> Result { - let mut bits = match num_channels { - 1 => (1, 0, 0, 0), - 2 => (1, 1, 0, 0), - 3 => (1, 1, 1, 0), - 4 => (1, 1, 1, 1), - _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE), - }; - let (kind, bit_width) = match format { - hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8 => { - (hipChannelFormatKind::hipChannelFormatKindUnsigned, u8::BITS) - } - hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16 => ( - hipChannelFormatKind::hipChannelFormatKindUnsigned, - u16::BITS, - ), - hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT32 => ( - hipChannelFormatKind::hipChannelFormatKindUnsigned, - u32::BITS, - ), - hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8 => { - (hipChannelFormatKind::hipChannelFormatKindSigned, i8::BITS) - } - hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16 => { - (hipChannelFormatKind::hipChannelFormatKindSigned, i16::BITS) - } - hipArray_Format::HIP_AD_FORMAT_SIGNED_INT32 => { - (hipChannelFormatKind::hipChannelFormatKindSigned, i32::BITS) - } - hipArray_Format::HIP_AD_FORMAT_HALF => ( - hipChannelFormatKind::hipChannelFormatKindFloat, - mem::size_of::() as u32 * u8::BITS, - ), - hipArray_Format::HIP_AD_FORMAT_FLOAT => ( - hipChannelFormatKind::hipChannelFormatKindFloat, - mem::size_of::() as u32 * u8::BITS, - ), - _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE), - }; - bits.0 *= bit_width; - bits.1 *= bit_width; - bits.2 *= bit_width; - bits.3 *= bit_width; - Ok(hipChannelFormatDesc { - x: bits.0 as i32, - y: bits.0 as i32, - z: bits.0 as i32, - w: bits.0 as i32, - f: kind, - }) -} diff --git a/zluda/tests/kernel_suld.rs b/zluda/tests/kernel_suld.rs index ad6e9649..07fc5606 100644 --- a/zluda/tests/kernel_suld.rs +++ b/zluda/tests/kernel_suld.rs @@ -340,10 +340,6 @@ unsafe fn kernel_suld_impl< if mem::size_of::() * CHANNELS < mem::size_of::() * SULD_N { return; } - // TODO: reenable those tests - if mem::size_of::() != mem::size_of::() || CHANNELS != SULD_N { - return; - } let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(seed); let size = 4usize; let random_size = rand::distributions::Uniform::::new(1, size as u32); diff --git a/zluda/tests/kernel_sust.rs b/zluda/tests/kernel_sust.rs index 831e467d..5057b563 100644 --- a/zluda/tests/kernel_sust.rs +++ b/zluda/tests/kernel_sust.rs @@ -312,7 +312,9 @@ unsafe fn byte_fill(vec: &mut Vec, value: u8) { fn extend_bytes_with(slice: &[u8], elm: u8, desired_length: usize) -> Vec { let mut result = slice.to_vec(); - result.extend(std::iter::repeat(elm).take(desired_length - slice.len())); + if desired_length > slice.len() { + result.extend(std::iter::repeat(elm).take(desired_length - slice.len())); + } result } @@ -337,10 +339,6 @@ unsafe fn kernel_sust_impl< if mem::size_of::() * CHANNELS < mem::size_of::() * SUST_N { return; } - // TODO: reenable those tests - if mem::size_of::() != mem::size_of::() || CHANNELS != SUST_N { - return; - } let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(seed); let size = 4usize; let random_size = rand::distributions::Uniform::::new(1, size as u32); @@ -461,4 +459,8 @@ unsafe fn kernel_sust_impl< assert_eq!(expected, &*observed); let mut unused = mem::zeroed(); assert_eq!(cuda.cuCtxPopCurrent(&mut unused), CUresult::CUDA_SUCCESS); + assert_eq!( + cuda.cuDevicePrimaryCtxRelease_v2(CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); }