Skip to content

Commit

Permalink
add avx2 for dot_q8_0_q8_0, 2x faster than scalar (#1211)
Browse files: browse the repository at this point in the history
  • Loading branch information
YannFollet authored Apr 28, 2023
1 parent 0b2da20 commit 04aaae1
Showing 1 changed file with 18 additions and 0 deletions.
18 changes: 18 additions & 0 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -3626,6 +3626,24 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
}

*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
#elif defined(__AVX2__)
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();

// Main loop
for (int i = 0; i < nb; ++i) {
// Compute combined scale for the block
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
__m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);

const __m256 q = mul_sum_i8_pairs_float(bx, by);

// Multiply q with scale and accumulate
acc = _mm256_fmadd_ps( d, q, acc );
}

*s = hsum_float_8(acc);
#else
// scalar
float sumf = 0.0;
Expand Down

0 comments on commit 04aaae1

Please sign in to comment.