gfx908 optimizations
IMbackK committed Jun 23, 2024
1 parent 6a2f298 commit 13fe282
Showing 4 changed files with 27 additions and 4 deletions.
ggml-cuda.cu (2 changes: 1 addition & 1 deletion)

@@ -1911,7 +1911,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor

 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

-    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
+    const bool fp16_performance_good = min_compute_capability >= CC_GCN4;

 #ifdef CUDA_USE_TENSOR_CORES
     use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
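
Context for the threshold change: AMD devices are assigned pseudo compute capabilities offset by CC_OFFSET_AMD = 1000000 (see common.cuh below), so gfx803 maps to 1000803 and gfx908 to 1000908. Lowering the bar from CC_RDNA1 (1001010) to CC_GCN4 (1000803) therefore marks GCN4, Vega, and CDNA cards as having good FP16 performance, not just RDNA. A minimal standalone sketch of the comparison; the constants are copied from this commit, the device list is illustrative:

// Sketch: how the AMD pseudo compute capabilities order devices for this check.
#include <cstdio>

constexpr int CC_OFFSET_AMD = 1000000;
constexpr int CC_GCN4  = CC_OFFSET_AMD +  803; // gfx803
constexpr int CC_VEGA  = CC_OFFSET_AMD +  900; // gfx900/gfx906
constexpr int CC_CDNA  = CC_OFFSET_AMD +  908; // gfx908/gfx90a
constexpr int CC_RDNA1 = CC_OFFSET_AMD + 1010;

int main() {
    const int  cc[]    = {CC_GCN4, CC_VEGA, CC_CDNA, CC_RDNA1};
    const char *name[] = {"GCN4", "Vega", "CDNA", "RDNA1"};
    for (int i = 0; i < 4; ++i) {
        // Old threshold: cc >= CC_RDNA1 (only RDNA passed).
        // New threshold: cc >= CC_GCN4 (GCN4, Vega and CDNA now pass as well).
        printf("%-5s old=%d new=%d\n", name[i], cc[i] >= CC_RDNA1, cc[i] >= CC_GCN4);
    }
    return 0;
}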
ggml-cuda/common.cuh (11 changes: 11 additions & 0 deletions)

@@ -142,6 +142,9 @@
 #define CC_TURING 750
 #define CC_AMPERE 800
 #define CC_OFFSET_AMD 1000000
+#define CC_GCN4 (CC_OFFSET_AMD + 803)
+#define CC_VEGA (CC_OFFSET_AMD + 900)
+#define CC_CDNA (CC_OFFSET_AMD + 908)
 #define CC_RDNA1 (CC_OFFSET_AMD + 1010)
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 #define CC_RDNA3 (CC_OFFSET_AMD + 1100)

@@ -233,6 +236,14 @@ typedef float2 dfloat2;
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300

+#if defined(__gfx908__) || defined(__gfx90a__)
+#define CDNA
+#endif
+
+#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
+#define GCN
+#endif
+
 #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
     defined(__gfx1150__) || defined(__gfx1151__)
 #define RDNA3
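
A note on the detection blocks above: hipcc compiles device code once per --offload-arch target and predefines the matching __gfx*__ macro in each pass, so CDNA and GCN resolve to per-architecture compile-time flags. A small sketch of consuming such a flag; the macro blocks are copied from the diff, while gpu_family() is a hypothetical helper, not part of the commit:

// Hypothetical helper showing the family flags in use (HIP/C++ assumed).
#if defined(__gfx908__) || defined(__gfx90a__)
#define CDNA
#endif

#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
#define GCN
#endif

static constexpr const char * gpu_family() {
#if defined(CDNA)
    return "CDNA";  // gfx908 (MI100), gfx90a (MI200 series)
#elif defined(GCN)
    return "GCN";   // gfx803, gfx900, gfx906
#else
    return "other"; // RDNA1-3 are detected by similar blocks in common.cuh
#endif
}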
ggml-cuda/mmq.cuh (6 changes: 5 additions & 1 deletion)

@@ -53,7 +53,11 @@ static constexpr __device__ int get_mmq_x_max_device() {

 static constexpr __device__ int get_mmq_y_device() {
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(CDNA) || defined(GCN)
+    return 32;
+#else
     return 128;
+#endif // defined(CDNA)
 #else
 #if __CUDA_ARCH__ >= CC_VOLTA
     return 128;

@@ -1972,7 +1976,7 @@ static __device__ void mul_mat_q_process_tile(

 template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
+#if defined(RDNA3) || defined(RDNA2) || defined(CDNA)
     __launch_bounds__(WARP_SIZE*nwarps, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
 #else
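
Two effects in this file: CDNA and GCN now use 32-row MMQ tiles instead of 128, plausibly to ease register and shared-memory pressure per block, and CDNA joins RDNA2/RDNA3 in requesting at least two resident blocks per compute unit. In CUDA/HIP, the second argument of __launch_bounds__ is the minimum number of blocks per multiprocessor the compiler should make possible, which it achieves by capping register usage. A minimal sketch of the pattern; the kernel is illustrative only, and WARP_SIZE follows ggml's definition of 32:

// Illustrative kernel: shows __launch_bounds__(maxThreadsPerBlock,
// minBlocksPerMultiprocessor) as used by mul_mat_q above.
#define WARP_SIZE 32 // matches ggml-cuda/common.cuh

template <int nwarps>
__global__ void __launch_bounds__(WARP_SIZE*nwarps, 2)
scale_kernel(float * x, const float scale, const int n) {
    // The compiler limits register use so >= 2 of these blocks fit per SM/CU.
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= scale;
    }
}

// Launch example: 8 warps per block -> 256 threads per block.
// scale_kernel<8><<<(n + 255)/256, WARP_SIZE*8>>>(x, 2.0f, n);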
ggml-cuda/mmvq.cu (12 changes: 10 additions & 2 deletions)

@@ -56,13 +56,21 @@ static __global__ void mul_mat_vec_q(

     constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);

-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA2) || defined(RDNA3)
     constexpr int nwarps              = 1;
     constexpr int rows_per_cuda_block = 1;
+#elif defined(CDNA)
+    constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
+    constexpr int rows_per_cuda_block = ncols_y == 4 ? ncols_y : 4;
+#else
+    constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
+    constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
+#endif
 #else
     constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
     constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

     const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
     const int row0 = rows_per_cuda_block*blockIdx.x;
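
Worked values for the CDNA branch, since the ternaries are easy to misread: nwarps is 4 when ncols_y <= 4 and 2 otherwise, while rows_per_cuda_block comes out as 4 for every ncols_y (the ncols_y == 4 arm also yields 4). So a CDNA block always covers 4 output rows, versus 1 row on RDNA2/RDNA3 and 1 or 2 rows on the default path. A compile-time sketch mirroring the selection; the helpers are hypothetical, in the kernel these are plain constexpr locals:

// Mirrors the CDNA branch of the constexpr selection above.
#include <cstdio>

template <int ncols_y> constexpr int nwarps_cdna()         { return ncols_y <= 4 ? 4 : 2; }
template <int ncols_y> constexpr int rows_per_block_cdna() { return ncols_y == 4 ? ncols_y : 4; }

int main() {
    printf("ncols_y=1: nwarps=%d rows_per_cuda_block=%d\n", nwarps_cdna<1>(), rows_per_block_cdna<1>());
    printf("ncols_y=4: nwarps=%d rows_per_cuda_block=%d\n", nwarps_cdna<4>(), rows_per_block_cdna<4>());
    printf("ncols_y=8: nwarps=%d rows_per_cuda_block=%d\n", nwarps_cdna<8>(), rows_per_block_cdna<8>());
    return 0; // prints 4/4, 4/4, 2/4: a CDNA block always covers 4 rows
}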
