From 5fa47bf6c72c7ccc2c2dd36755c6cbba9c30dbc6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 3 May 2023 23:13:37 +0300 Subject: [PATCH 01/32] ggml : remove Q4_0 bit shufling (ARM NEON) --- ggml.c | 468 +++++---------------------------------------------------- 1 file changed, 39 insertions(+), 429 deletions(-) diff --git a/ggml.c b/ggml.c index 4e309df8a48e2..f957b786042ce 100644 --- a/ggml.c +++ b/ggml.c @@ -837,348 +837,52 @@ static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block s // reference implementation for deterministic creation of model files static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { + assert(QK4_0 / 16 == 0); assert(k % QK4_0 == 0); - const int nb = k / QK4_0; - uint8_t pp[QK4_0/2]; + const int nb = k / QK4_0; for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max - float max = 0.0f; + float max = 0.0f; for (int l = 0; l < QK4_0; l++) { const float v = x[i*QK4_0 + l]; if (amax < fabsf(v)) { amax = fabsf(v); - max = v; + max = v; } } - const float d = max / -8; + const float d = max / -8; const float id = d ? 1.0f/d : 0.0f; y[i].d = d; - for (int l = 0; l < QK4_0; l += 2) { - const float v0 = x[i*QK4_0 + l + 0]*id; - const float v1 = x[i*QK4_0 + l + 1]*id; + uint64_t qs[QK4_0 / 16] = {0}; - const uint8_t vi0 = MIN(15, (int8_t)roundf(v0) + 8); - const uint8_t vi1 = MIN(15, (int8_t)roundf(v1) + 8); + // pack first half of weights into low nibbles and second half into high nibbles + for (int l = 0; l < QK4_0/2; ++l) { + const float v0 = x[i*QK4_0 + 0 + l]*id; + const float v1 = x[i*QK4_0 + QK4_0/2 + l]*id; - assert(vi0 < 16); - assert(vi1 < 16); + const uint64_t vi0 = MIN(15, (int8_t)(v0 + 8.5f)); + const uint64_t vi1 = MIN(15, (int8_t)(v1 + 8.5f)); - pp[l/2] = vi0 | (vi1 << 4); + qs[l/8] |= vi0 << (8*(l & 7)); + qs[l/8] |= vi1 << (8*(l & 7) + 4); } - memcpy(y[i].qs, pp, sizeof(pp)); + memcpy(y[i].qs, qs, sizeof(qs)); } } static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int k) { assert(k % QK4_0 == 0); - const int nb = k / QK4_0; block_q4_0 * restrict y = vy; -#if defined(__POWER9_VECTOR__) - const vector float v85 = vec_splats(8.5f); - const vector signed int v15 = vec_splats(15); - for (int i = 0; i < nb; i++) { - float max = 0.0f; - float min = 0.0f; - - vector float asrcv [8]; - vector float srcv [8]; - vector float maxv[8]; - vector float minv[8]; - - for (int l = 0; l < 8; l++) srcv[l] = *(vector float *)(x + i*32 + 4*l); - //for (int l = 0; l < 8; l++) asrcv[l] = vec_abs(srcv[l]); - - for (int l = 0; l < 4; l++) maxv[2*l] = vec_max(asrcv[2*l], asrcv[2*l+1]); - //for (int l = 0; l < 2; l++) maxv[4*l] = vec_max(maxv[4*l], maxv[4*l+2]); - maxv[0] = vec_max(maxv[0], maxv[2]); - maxv[4] = vec_max(maxv[4], maxv[6]); - //for (int l = 0; l < 1; l++) maxv[8*l] = vec_max(maxv[8*l], maxv[8*l+4]); - maxv[0] = vec_max(maxv[0], maxv[4]); - - for (int l = 0; l < 4; l++) minv[2*l] = vec_min(asrcv[2*l], asrcv[2*l+1]); - //for (int l = 0; l < 2; l++) minv[4*l] = vec_min(minv[4*l], minv[4*l+2]); - minv[0] = vec_min(minv[0], minv[2]); - minv[4] = vec_min(minv[4], minv[6]); - //for (int l = 0; l < 1; l++) minv[8*l] = vec_min(minv[8*l], minv[8*l+4]); - minv[0] = vec_min(minv[0], minv[4]); - - - max = MAX( - MAX(vec_extract(maxv[0], 0), vec_extract(maxv[0], 1)), - MAX(vec_extract(maxv[0], 2), vec_extract(maxv[0], 3))); - min = MIN( - MIN(vec_extract(minv[0], 0), vec_extract(minv[0], 1)), - MIN(vec_extract(minv[0], 2), vec_extract(minv[0], 3))); - - const float magnitude 
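/*
 * Note: "magnitude" keeps the sign of the extreme value on purpose.
 * The scale below is d = magnitude / -8, so the weight with the largest
 * magnitude maps exactly to the end point -8 of the signed 4-bit range
 * [-8, 7], whichever sign that extreme has.
 */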
= max >= fabsf(min) ? max : min; - const float d = magnitude / -8; - const float id = d ? 1.0/d : 0.0; - - y[i].d = d; - - const vector float vid = vec_splats(id); - uint8_t * restrict pb = y[i].qs; - for (int l = 0; l < 8; l++) { - const vector float vf = vec_madd(srcv[l], vid, v85); - const vector signed int vi = vec_signed(vf); - const vector signed int vc = vec_min(vi, v15); - - pb[2*l + 0] = vec_extract(vc, 0) | (vec_extract(vc, 1) << 4); - pb[2*l + 1] = vec_extract(vc, 2) | (vec_extract(vc, 3) << 4); - } - } -#elif __ARM_NEON - for (int i = 0; i < nb; i++) { - float32x4_t srcv [8]; - float32x4_t maxv[8]; - float32x4_t minv[8]; - - for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l); - - for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l+1]); - for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l+2]); - for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l+4]); - - for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l+1]); - for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l+2]); - for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l+4]); - - const float max = vmaxvq_f32(maxv[0]); - const float min = vminvq_f32(minv[0]); - - const float magnitude = max >= fabsf(min) ? max : min; - const float d = magnitude / -8; - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = d; - - for (int l = 0; l < 8; l++) { - const float32x4_t v = vmulq_n_f32(srcv[l], id); - const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f)); - const int32x4_t vi = vcvtq_s32_f32(vf); - const int32x4_t vc = vminq_s32(vi, vdupq_n_s32(15)); - - y[i].qs[2*l + 0] = vgetq_lane_s32(vc, 0) | (vgetq_lane_s32(vc, 1) << 4); - y[i].qs[2*l + 1] = vgetq_lane_s32(vc, 2) | (vgetq_lane_s32(vc, 3) << 4); - } - } -#elif defined(__AVX2__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max for the block - __m256 max = _mm256_max_ps( v0, v1 ); - __m256 maxTmp = _mm256_max_ps( v2, v3 ); - max = _mm256_max_ps( max, maxTmp ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( max, 1 ), _mm256_castps256_ps128( max ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); - - // Compute min for the block - __m256 min = _mm256_min_ps( v0, v1 ); - __m256 minTmp = _mm256_min_ps( v2, v3 ); - min = _mm256_min_ps( min, minTmp ); - - __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( min, 1 ), _mm256_castps256_ps128( min ) ); - min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) ); - min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) ); - const float minScalar = _mm_cvtss_f32( min4 ); - - // Quantize these floats - const float magnitude = maxScalar >= fabsf(minScalar) ? maxScalar : minScalar; - const float d = magnitude / -8.0f; - y[i].d = d; - const float id = ( magnitude != 0.0f ) ? 
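/*
 * id is the reciprocal scale 1/d, written directly as -8/magnitude so
 * that each element is quantized with a single multiply instead of a
 * division per element.
 */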
-8.0f / magnitude : 0.0f; - const __m256 mul = _mm256_set1_ps( id ); - - // Apply the multiplier - v0 = _mm256_mul_ps( v0, mul ); - v1 = _mm256_mul_ps( v1, mul ); - v2 = _mm256_mul_ps( v2, mul ); - v3 = _mm256_mul_ps( v3, mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - - // Convert int32 to int16 - i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 - i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 - // Convert int16 to int8 - i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 - - // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); - - // Apply offset and clamp to translate the range from [ -8 .. +8 ] into [ +0 .. +15 ] - const __m256i off = _mm256_set1_epi8( 8 ); - i0 = _mm256_add_epi8( i0, off ); - const __m256i maxNibble = _mm256_set1_epi8( 15 ); - i0 = _mm256_min_epi8( i0, maxNibble ); - - // Compress the vector into 4 bit/value, and store - __m128i res = packNibbles( i0 ); - _mm_storeu_si128( ( __m128i* )y[i].qs, res ); - } -#elif defined(__AVX__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max for the block - __m256 max = _mm256_max_ps( v0, v1 ); - __m256 maxTmp = _mm256_max_ps( v2, v3 ); - max = _mm256_max_ps( max, maxTmp ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( max, 1 ), _mm256_castps256_ps128( max ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); - - // Compute min for the block - __m256 min = _mm256_min_ps( v0, v1 ); - __m256 minTmp = _mm256_min_ps( v2, v3 ); - min = _mm256_min_ps( min, minTmp ); - - __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( min, 1 ), _mm256_castps256_ps128( min ) ); - min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) ); - min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) ); - const float minScalar = _mm_cvtss_f32( min4 ); - - // Quantize these floats - const float magnitude = maxScalar >= fabsf(minScalar) ? maxScalar : minScalar; - const float d = magnitude / -8.0f; - y[i].d = d; - const float id = ( magnitude != 0.0f ) ? 
-8.0f / magnitude : 0.0f; - const __m256 mul = _mm256_set1_ps( id ); - - // Apply the multiplier - v0 = _mm256_mul_ps( v0, mul ); - v1 = _mm256_mul_ps( v1, mul ); - v2 = _mm256_mul_ps( v2, mul ); - v3 = _mm256_mul_ps( v3, mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - - // Since we don't have in AVX some necessary functions, - // we split the registers in half and call AVX2 analogs from SSE - __m128i ni0 = _mm256_castsi256_si128( i0 ); - __m128i ni1 = _mm256_extractf128_si256( i0, 1); - __m128i ni2 = _mm256_castsi256_si128( i1 ); - __m128i ni3 = _mm256_extractf128_si256( i1, 1); - __m128i ni4 = _mm256_castsi256_si128( i2 ); - __m128i ni5 = _mm256_extractf128_si256( i2, 1); - __m128i ni6 = _mm256_castsi256_si128( i3 ); - __m128i ni7 = _mm256_extractf128_si256( i3, 1); - - // Convert int32 to int16 - ni0 = _mm_packs_epi32( ni0, ni1 ); - ni2 = _mm_packs_epi32( ni2, ni3 ); - ni4 = _mm_packs_epi32( ni4, ni5 ); - ni6 = _mm_packs_epi32( ni6, ni7 ); - // Convert int16 to int8 - ni0 = _mm_packs_epi16( ni0, ni2 ); - ni4 = _mm_packs_epi16( ni4, ni6 ); - - // Apply offset and clamp to translate the range from [ -8 .. +8 ] into [ +0 .. +15 ] - const __m128i off = _mm_set1_epi8( 8 ); - ni0 = _mm_add_epi8( ni0, off ); - ni4 = _mm_add_epi8( ni4, off ); - const __m128i maxNibble = _mm_set1_epi8( 15 ); - ni0 = _mm_min_epi8( ni0, maxNibble ); - ni4 = _mm_min_epi8( ni4, maxNibble ); - - // Compress the vector into 4 bit/value, and store - __m128i res = packNibbles( ni0, ni4 ); - _mm_storeu_si128( ( __m128i* )y[i].qs, res ); - } -#elif defined(__wasm_simd128__) - for (int i = 0; i < nb; i++) { - float max = 0.0f; - float min = 0.0f; - - v128_t srcv [8]; - v128_t maxv[8]; - v128_t minv[8]; - - for (int l = 0; l < 8; l++) srcv[l] = wasm_v128_load(x + i*32 + 4*l); - - for (int l = 0; l < 4; l++) maxv[2*l] = wasm_f32x4_max(srcv[2*l], srcv[2*l+1]); - for (int l = 0; l < 2; l++) maxv[4*l] = wasm_f32x4_max(maxv[4*l], maxv[4*l+2]); - for (int l = 0; l < 1; l++) maxv[8*l] = wasm_f32x4_max(maxv[8*l], maxv[8*l+4]); - - for (int l = 0; l < 4; l++) minv[2*l] = wasm_f32x4_min(srcv[2*l], srcv[2*l+1]); - for (int l = 0; l < 2; l++) minv[4*l] = wasm_f32x4_min(minv[4*l], minv[4*l+2]); - for (int l = 0; l < 1; l++) minv[8*l] = wasm_f32x4_min(minv[8*l], minv[8*l+4]); - - max = MAX( - MAX(wasm_f32x4_extract_lane(maxv[0], 0), wasm_f32x4_extract_lane(maxv[0], 1)), - MAX(wasm_f32x4_extract_lane(maxv[0], 2), wasm_f32x4_extract_lane(maxv[0], 3))); - min = MIN( - MIN(wasm_f32x4_extract_lane(minv[0], 0), wasm_f32x4_extract_lane(minv[0], 1)), - MIN(wasm_f32x4_extract_lane(minv[0], 2), wasm_f32x4_extract_lane(minv[0], 3))); - - const float magnitude = max >= fabsf(min) ? max : min; - const float d = magnitude / -8; - const float id = d ? 
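/*
 * The 8.5f added below folds two steps into one: +8 shifts the signed
 * range [-8, 7] onto the unsigned nibble range [0, 15], and the extra
 * 0.5 turns the following truncation into round-to-nearest. The
 * min(..., 15) clamp catches the single value that can still land on 16.
 */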
1.0/d : 0.0; - - y[i].d = d; - - for (int l = 0; l < 8; l++) { - const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id)); - const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f)); - const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf); - const v128_t vc = wasm_i32x4_min(vi, wasm_i32x4_splat(15)); - - y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4); - y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4); - } - } -#else - // scalar quantize_row_q4_0_reference(x, y, k); -#endif } static void quantize_row_q4_1_reference(const float * restrict x, void * restrict vy, int k) { @@ -1843,121 +1547,33 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int } static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) { + assert(QK4_0 / 16 == 0); assert(k % QK4_0 == 0); + const int nb = k / QK4_0; const block_q4_0 * restrict x = vx; -#if defined(__AVX2__) - for (int i = 0; i < nb; i++) { - // scale factor - const __m256 d_v = _mm256_broadcast_ss(&x[i].d); - - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_0; l += 32) { - // Load 32x4-bit integers into 32x8-bit integers - __m256i vx8 = bytes_from_nibbles_32(pp+l/2); - - // Subtract 8 from the integers - vx8 = _mm256_sub_epi8(vx8, _mm256_set1_epi8(8)); - - // Convert to 16-bit int - const __m256i vx16_lo = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 0)); - const __m256i vx16_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 1)); - - // Convert to 32-bit int -> float 32 - const __m256 vf[4] = { - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 0))), - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 1))), - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 0))), - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 1))) - }; - - // Scale and store - for (int j = 0; j < 4; j++) { - const __m256 result = _mm256_mul_ps(vf[j], d_v); - _mm256_storeu_ps(y + i * QK4_0 + l + j*8, result); - } - } - } -#elif defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - const float32x4_t vd = vdupq_n_f32(x[i].d); - - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_0; l += 16) { - // Load 16x4-bit integers into 8x8-bit integers - const uint8x8_t v8 = vld1_u8(pp + l/2); - - // Expand 4-bit qs to 8-bit bytes - const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0F)); - const uint8x8_t v1 = vshr_n_u8(v8, 4); - - // Convert to signed 8-bit integers - const int8x8_t vs_0 = vreinterpret_s8_u8(v0); - const int8x8_t vs_1 = vreinterpret_s8_u8(v1); - - // Subtract 8 from each byte - const int8x8_t vb_0 = vsub_s8(vs_0, vdup_n_s8(8)); - const int8x8_t vb_1 = vsub_s8(vs_1, vdup_n_s8(8)); - - // Interleave and combine - const int8x8_t vx_0 = vzip1_s8(vb_0, vb_1); - const int8x8_t vx_1 = vzip2_s8(vb_0, vb_1); - - const int8x16_t vq = vcombine_s8(vx_0, vx_1); - - // convert to 2x int16x8_t - const int16x8_t vi_0 = vmovl_s8(vget_low_s8 (vq)); - const int16x8_t vi_1 = vmovl_s8(vget_high_s8(vq)); - - // convert to 4x float32x4_t - const float32x4_t vf_0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_0))); - const float32x4_t vf_1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_0))); - const float32x4_t vf_2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_1))); - const float32x4_t vf_3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_1))); - - // Multiply by d - const float32x4_t r0 = vmulq_f32(vf_0, vd); - const float32x4_t r1 = 
vmulq_f32(vf_1, vd); - const float32x4_t r2 = vmulq_f32(vf_2, vd); - const float32x4_t r3 = vmulq_f32(vf_3, vd); - - // Store - vst1q_f32(y + i*QK4_0 + l + 0, r0); - vst1q_f32(y + i*QK4_0 + l + 4, r1); - vst1q_f32(y + i*QK4_0 + l + 8, r2); - vst1q_f32(y + i*QK4_0 + l + 12, r3); - } - } -#else - // scalar for (int i = 0; i < nb; i++) { const float d = x[i].d; - const uint8_t * restrict pp = x[i].qs; + // unpack nibbles into bytes + uint64_t qs[QK4_0 / 8] = {0}; - for (int l = 0; l < QK4_0; l += 2) { - const uint8_t vi = pp[l/2]; + memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); + memcpy(qs + QK4_0 / 16, x[i].qs, sizeof(x[i].qs)); - const int8_t vi0 = vi & 0x0F; - const int8_t vi1 = vi >> 4; - - const float v0 = (vi0 - 8)*d; - const float v1 = (vi1 - 8)*d; - - //printf("d = %f, vi = %d, vi0 = %d, vi1 = %d, v0 = %f, v1 = %f\n", d, vi, vi0, vi1, v0, v1); + for (int l = 0; l < QK4_0 / 16; ++l) { + qs[l ] = (qs[l ] & 0x0F0F0F0F0F0F0F0FULL) >> 0; + qs[l + QK4_0/16] = (qs[l + QK4_0/16] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; + } - y[i*QK4_0 + l + 0] = v0; - y[i*QK4_0 + l + 1] = v1; + const uint8_t * restrict qsp = (const uint8_t * restrict) qs; - assert(!isnan(y[i*QK4_0 + l + 0])); - assert(!isnan(y[i*QK4_0 + l + 1])); + for (int l = 0; l < QK4_0; ++l) { + y[i*QK4_0 + l] = (qsp[l] - 8)*d; } } -#endif } static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, int k) { @@ -2887,12 +2503,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); - // interleave - const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs); - const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs); - const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs); - const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs); - // load y const int8x16_t v1_0l = vld1q_s8(y0->qs); const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); @@ -2901,21 +2511,21 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * #if defined(__ARM_FEATURE_DOTPROD) // dot product into int32x4_t - const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h); - const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h); + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d); #else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h)); + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = 
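/*
 * With the non-shuffled layout, the masked low nibbles (v0_*ls) are the
 * first 16 weights of a block and the shifted high nibbles (v0_*hs) are
 * the second 16, so they already line up with the two 16-byte halves of
 * the q8_0 operand. The vzip1q_s8/vzip2q_s8 interleave that the old
 * bit-shuffled format needed is gone from this kernel.
 */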
vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h)); const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); From 844d2af89dcfe87129428eacd3d03feba4de8ee6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 May 2023 20:53:14 +0300 Subject: [PATCH 02/32] ggml : remove Q4_1 bit shuffling (ARM NEON + reference) --- ggml.c | 416 +++++++++++++++------------------------------------------ 1 file changed, 111 insertions(+), 305 deletions(-) diff --git a/ggml.c b/ggml.c index f957b786042ce..cb8cea045567b 100644 --- a/ggml.c +++ b/ggml.c @@ -837,17 +837,19 @@ static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block s // reference implementation for deterministic creation of model files static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { - assert(QK4_0 / 16 == 0); - assert(k % QK4_0 == 0); + static const int qk = QK4_0; - const int nb = k / QK4_0; + assert(qk / 16 == 0); + assert(k % qk == 0); + + const int nb = k / qk; for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max float max = 0.0f; - for (int l = 0; l < QK4_0; l++) { - const float v = x[i*QK4_0 + l]; + for (int l = 0; l < qk; l++) { + const float v = x[i*qk + l]; if (amax < fabsf(v)) { amax = fabsf(v); max = v; @@ -862,9 +864,9 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r uint64_t qs[QK4_0 / 16] = {0}; // pack first half of weights into low nibbles and second half into high nibbles - for (int l = 0; l < QK4_0/2; ++l) { - const float v0 = x[i*QK4_0 + 0 + l]*id; - const float v1 = x[i*QK4_0 + QK4_0/2 + l]*id; + for (int l = 0; l < qk/2; ++l) { + const float v0 = x[i*qk + 0 + l]*id; + const float v1 = x[i*qk + qk/2 + l]*id; const uint64_t vi0 = MIN(15, (int8_t)(v0 + 8.5f)); const uint64_t vi1 = MIN(15, (int8_t)(v1 + 8.5f)); @@ -877,176 +879,55 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r } } -static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int k) { - assert(k % QK4_0 == 0); - - block_q4_0 * restrict y = vy; - +static void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { quantize_row_q4_0_reference(x, y, k); } -static void quantize_row_q4_1_reference(const float * restrict x, void * restrict vy, int k) { - assert(k % QK4_1 == 0); - const int nb = k / QK4_1; +static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { + const int qk = QK4_1; - block_q4_1 * restrict y = vy; + assert(qk / 16 == 0); + assert(k % qk == 0); - uint8_t pp[QK4_1/2]; + const int nb = k / qk; for (int i = 0; i < nb; i++) { float min = FLT_MAX; float max = -FLT_MAX; - for (int l = 0; l < QK4_1; l++) { - const float v = x[i*QK4_1 + l]; + for (int l = 0; l < qk; l++) { + const float v = x[i*qk + l]; + if (v < min) min = v; if (v > max) max = v; } - const float d = (max - min) / ((1 << 4) - 1); + const float d = (max - min) / ((1 << 4) - 1); const float id = d ? 
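/*
 * q4_1 is affine rather than symmetric: d spreads [min, max] across the
 * 16 nibble values and min is kept as the per-block offset m, so a
 * weight is reconstructed as q*d + m. Rounding needs only +0.5f here
 * because subtracting min has already shifted the values into [0, 15].
 */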
1.0f/d : 0.0f; y[i].d = d; y[i].m = min; - for (int l = 0; l < QK4_1; l += 2) { - const float v0 = (x[i*QK4_1 + l + 0] - min)*id; - const float v1 = (x[i*QK4_1 + l + 1] - min)*id; + uint64_t qs[QK4_1 / 16] = {0}; - const uint8_t vi0 = roundf(v0); - const uint8_t vi1 = roundf(v1); + // pack first half of weights into low nibbles and second half into high nibbles + for (int l = 0; l < qk/2; ++l) { + const float v0 = (x[i*qk + 0 + l] - min)*id; + const float v1 = (x[i*qk + qk/2 + l] - min)*id; - assert(vi0 < 16); - assert(vi1 < 16); + const uint64_t vi0 = MIN(15, (int8_t)(v0 + 0.5f)); + const uint64_t vi1 = MIN(15, (int8_t)(v1 + 0.5f)); - pp[l/2] = vi0 | (vi1 << 4); + qs[l/8] |= vi0 << (8*(l & 7)); + qs[l/8] |= vi1 << (8*(l & 7) + 4); } - memcpy(y[i].qs, pp, sizeof(pp)); + memcpy(y[i].qs, qs, sizeof(qs)); } } -static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int k) { - assert(k % QK4_1 == 0); - - const int nb = k / QK4_1; - - block_q4_1 * restrict y = vy; - -#if defined(__AVX2__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max for the block - __m256 vmax; - vmax = _mm256_max_ps( v0, v1 ); - vmax = _mm256_max_ps( vmax, v2 ); - vmax = _mm256_max_ps( vmax, v3 ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( vmax, 1 ), _mm256_castps256_ps128( vmax ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); - - // Compute min for the block - __m256 vmin; - vmin = _mm256_min_ps( v0, v1 ); - vmin = _mm256_min_ps( vmin, v2 ); - vmin = _mm256_min_ps( vmin, v3 ); - - __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( vmin, 1 ), _mm256_castps256_ps128( vmin ) ); - min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) ); - min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) ); - const float minScalar = _mm_cvtss_f32( min4 ); - - // Quantize these floats - const float d = (maxScalar - minScalar) / ((1 << 4) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - y[i].m = minScalar; - y[i].d = d; - - // x = (x-min)*id - const __m256 mul = _mm256_set1_ps( id ); - const __m256 off = _mm256_set1_ps( minScalar ); - v0 = _mm256_mul_ps( _mm256_sub_ps( v0, off ), mul ); - v1 = _mm256_mul_ps( _mm256_sub_ps( v1, off ), mul ); - v2 = _mm256_mul_ps( _mm256_sub_ps( v2, off ), mul ); - v3 = _mm256_mul_ps( _mm256_sub_ps( v3, off ), mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - - // Convert int32 to int16 - i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 - i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 - // Convert int16 to int8 - i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 - - // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); - - // Compress the vector into 4 bit/value, and store - __m128i res = packNibbles( i0 ); - _mm_storeu_si128( ( __m128i* )y[i].qs, res ); - } -#elif __ARM_NEON - for (int i = 0; i < nb; i++) { - float32x4_t srcv[8]; - float32x4_t minv[8]; - float32x4_t maxv[8]; - - for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*QK4_1 + 4*l); - - for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]); - for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]); - for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l + 4]); - - for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l + 1]); - for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l + 2]); - for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l + 4]); - - const float min = vminvq_f32(minv[0]); - const float max = vmaxvq_f32(maxv[0]); - - const float d = (max - min) / ((1 << 4) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - y[i].d = d; - y[i].m = min; - - const float32x4_t minv0 = vdupq_n_f32(min); - - for (int l = 0; l < 8; l++) { - const float32x4_t v = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id); - const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(0.5f)); // needed to round to nearest - const int32x4_t vi = vcvtq_s32_f32(vf); - - y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4); - y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4); - } - } -#else - // scalar - quantize_row_q4_1_reference(x, vy, k); -#endif +static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { + quantize_row_q4_1_reference(x, y, k); } // reference implementation for deterministic creation of model files @@ -1546,13 +1427,13 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int #endif } -static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) { - assert(QK4_0 / 16 == 0); - assert(k % QK4_0 == 0); +static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { + static const int qk = QK4_0; - const int nb = k / QK4_0; + assert(qk / 16 == 0); + assert(k % qk == 0); - const block_q4_0 * restrict x = vx; + const int nb = k / qk; for (int i = 0; i < nb; i++) { const float d = x[i].d; @@ -1560,126 +1441,49 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in // unpack nibbles into bytes uint64_t qs[QK4_0 / 8] = {0}; - memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); - memcpy(qs + QK4_0 / 16, x[i].qs, sizeof(x[i].qs)); + memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); - for (int l = 0; l < QK4_0 / 16; ++l) { - qs[l ] = (qs[l ] & 0x0F0F0F0F0F0F0F0FULL) >> 0; - qs[l + QK4_0/16] = (qs[l + QK4_0/16] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; + for (int l = 0; l < qk / 16; ++l) { + qs[l + qk/16] = (qs[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; + qs[l + 0 ] = (qs[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; } const uint8_t * restrict qsp = (const uint8_t * restrict) qs; - for (int l = 0; l < QK4_0; ++l) { - y[i*QK4_0 + l] = (qsp[l] - 8)*d; + for (int l = 0; l < qk; ++l) { + y[i*qk + l] = (qsp[l] - 8)*d; } } } -static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, int k) { - assert(k % QK4_1 == 0); - const int nb = k / QK4_1; - - const block_q4_1 * restrict x = vx; - -#if defined(__AVX2__) - for (int i = 0; i < nb; i++) { - const __m256 d_v = _mm256_broadcast_ss(&x[i].d); - const __m256 d_m = _mm256_broadcast_ss(&x[i].m); - - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_1; l += 32) { - // Load 32x4-bit integers into 32x8-bit integers - __m256i vx8 = bytes_from_nibbles_32(pp+l/2); +static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { + static const int qk = QK4_1; - // Convert to 16-bit int - const __m256i vx16_lo = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 0)); - const __m256i vx16_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 1)); + assert(qk / 16 == 0); + assert(k % qk == 0); - // Convert to 32-bit int -> float 32 - const __m256 vf[4] = { - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 0))), - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 1))), - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 0))), - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 1))) - }; - - // Scale, add m and store - for (int j = 0; j < 4; j++) { - const __m256 result = _mm256_add_ps(_mm256_mul_ps(vf[j], 
d_v), d_m); - _mm256_storeu_ps(y + i * QK4_1 + l + j*8, result); - } - } - } -#elif defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - const float32x4_t vd = vdupq_n_f32(x[i].d); - const float32x4_t vm = vdupq_n_f32(x[i].m); - - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_1; l += 16) { - // Load 16x4-bit integers into 8x8-bit integers - const uint8x8_t v8 = vld1_u8(pp + l/2); - - // Expand 4-bit qs to 8-bit bytes - const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0F)); - const uint8x8_t v1 = vshr_n_u8(v8, 4); + const int nb = k / qk; - // Interleave and combine - const uint8x8_t vx_0 = vzip1_u8(v0, v1); - const uint8x8_t vx_1 = vzip2_u8(v0, v1); - - const uint8x16_t vq = vcombine_u8(vx_0, vx_1); - - // convert to 2x uint16x8_t - const uint16x8_t vi_0 = vmovl_u8(vget_low_u8 (vq)); - const uint16x8_t vi_1 = vmovl_u8(vget_high_u8(vq)); - - // convert to 4x float32x4_t - const float32x4_t vf_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_0))); - const float32x4_t vf_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_0))); - const float32x4_t vf_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_1))); - const float32x4_t vf_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_1))); - - // multiply by d and add m - const float32x4_t r0 = vmlaq_f32(vm, vf_0, vd); - const float32x4_t r1 = vmlaq_f32(vm, vf_1, vd); - const float32x4_t r2 = vmlaq_f32(vm, vf_2, vd); - const float32x4_t r3 = vmlaq_f32(vm, vf_3, vd); - - // Store - vst1q_f32(y + i*QK4_1 + l + 0, r0); - vst1q_f32(y + i*QK4_1 + l + 4, r1); - vst1q_f32(y + i*QK4_1 + l + 8, r2); - vst1q_f32(y + i*QK4_1 + l + 12, r3); - } - } -#else for (int i = 0; i < nb; i++) { const float d = x[i].d; const float m = x[i].m; - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_1; l += 2) { - const uint8_t vi = pp[l/2]; + // unpack nibbles into bytes + uint64_t qs[QK4_0 / 8] = {0}; - const int8_t vi0 = vi & 0x0F; - const int8_t vi1 = vi >> 4; + memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); - const float v0 = vi0*d + m; - const float v1 = vi1*d + m; + for (int l = 0; l < qk / 16; ++l) { + qs[l + qk/16] = (qs[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; + qs[l + 0 ] = (qs[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; + } - y[i*QK4_1 + l + 0] = v0; - y[i*QK4_1 + l + 1] = v1; + const uint8_t * restrict qsp = (const uint8_t * restrict) qs; - assert(!isnan(y[i*QK4_1 + l + 0])); - assert(!isnan(y[i*QK4_1 + l + 1])); + for (int l = 0; l < qk; ++l) { + y[i*qk + l] = qsp[l]*d + m; } } -#endif } static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, int k) { @@ -1810,7 +1614,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = { - .dequantize_row_q = dequantize_row_q4_0, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0, .quantize_row_q = quantize_row_q4_0, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, .quantize_row_q_dot = quantize_row_q8_0, @@ -1818,7 +1622,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q4_1] = { - .dequantize_row_q = dequantize_row_q4_1, + .dequantize_row_q = (dequantize_row_q_t)dequantize_row_q4_1, .quantize_row_q = quantize_row_q4_1, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, .quantize_row_q_dot = quantize_row_q8_1, @@ -2467,9 +2271,10 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t } static void ggml_vec_dot_q4_0_q8_0(const int n, float * 
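/*
 * Per 32-element block, this kernel computes
 *
 *     s += x->d * y->d * sum_j (q_x[j] - 8) * q_y[j]
 *
 * the -8 offset of q4_0 is applied to the nibbles before the integer
 * dot product, and both block scales are factored out once per block.
 */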
restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK8_0; + const int qk = QK8_0; + const int nb = n / qk; - assert(n % QK8_0 == 0); + assert(n % qk == 0); assert(nb % 2 == 0); const block_q4_0 * restrict x = vx; @@ -2604,41 +2409,45 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * #else // scalar float sumf = 0.0; + for (int i = 0; i < nb; i++) { - const float d0 = x[i].d; - const float d1 = y[i].d; + // unpack nibbles into bytes + uint64_t qs[QK8_0 / 8] = {0}; - const uint8_t * restrict p0 = x[i].qs; - const int8_t * restrict p1 = y[i].qs; + memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); - int sumi = 0; - for (int j = 0; j < QK8_0/2; j++) { - const uint8_t v0 = p0[j]; + for (int l = 0; l < qk / 16; ++l) { + qs[l + qk/16] = (qs[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; + qs[l + 0 ] = (qs[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; + } - const int i0 = (int8_t) (v0 & 0x0F) - 8; - const int i1 = (int8_t) (v0 >> 4) - 8; + const uint8_t * restrict px = (const uint8_t * restrict) qs; + const int8_t * restrict py = y[i].qs; - const int i2 = p1[2*j + 0]; - const int i3 = p1[2*j + 1]; + int sumi = 0; - sumi += i0*i2 + i1*i3; + for (int j = 0; j < qk; ++j) { + sumi += (px[j] - 8) * py[j]; } - sumf += d0*d1*sumi; + + sumf += (x[i].d*y[i].d)*sumi; } + *s = sumf; #endif } static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK8_1; + const int qk = QK8_1; + const int nb = n / qk; - assert(n % QK8_1 == 0); + assert(n % qk == 0); assert(nb % 2 == 0); const block_q4_1 * restrict x = vx; const block_q8_1 * restrict y = vy; - // TODO: add AVX / WASM SIMD / etc + // TODO: add WASM SIMD #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); @@ -2664,12 +2473,6 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - // interleave - const int8x16_t v0_0lz = vzip1q_s8(v0_0l, v0_0h); - const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h); - const int8x16_t v0_1lz = vzip1q_s8(v0_1l, v0_1h); - const int8x16_t v0_1hz = vzip2q_s8(v0_1l, v0_1h); - // load y const int8x16_t v1_0l = vld1q_s8(y0->qs); const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); @@ -2678,21 +2481,21 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * #if defined(__ARM_FEATURE_DOTPROD) // dot product into int32x4_t - const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h); - const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h); + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d); #else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h)); + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l)); + const int16x8_t ph0l = 
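/*
 * Fallback for cores without the ARMv8.2 dot-product extension:
 * vmull_s8 widens the int8 products to int16 and vpaddlq_s16
 * pairwise-accumulates them into int32 lanes, costing roughly twice
 * the instructions of a single vdotq_s32 per 16-byte pair.
 */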
vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h)); - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h)); + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h)); const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); @@ -2738,27 +2541,30 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * #else // scalar float sumf = 0.0; + for (int i = 0; i < nb; i++) { - const float d0 = x[i].d; - const float m0 = x[i].m; - const float d1 = y[i].d; + // unpack nibbles into bytes + uint64_t qs[QK8_1 / 8] = {0}; - const uint8_t * restrict p0 = x[i].qs; - const int8_t * restrict p1 = y[i].qs; + memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); - // TODO: this is very slow .. - for (int j = 0; j < QK8_1/2; j++) { - const uint8_t v0 = p0[j]; + for (int l = 0; l < qk / 16; ++l) { + qs[l + qk/16] = (qs[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; + qs[l + 0 ] = (qs[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; + } - const float f0 = d0*(v0 & 0x0F) + m0; - const float f1 = d0*(v0 >> 4) + m0; + const uint8_t * restrict px = (const uint8_t * restrict) qs; + const int8_t * restrict py = y[i].qs; - const float f2 = d1*p1[2*j + 0]; - const float f3 = d1*p1[2*j + 1]; + int sumi = 0; - sumf += f0*f2 + f1*f3; + for (int j = 0; j < qk; ++j) { + sumi += px[j]*py[j]; } + + sumf += (x[i].d*y[i].d)*sumi + x[i].m*(y[i].s0 + y[i].s1); } + *s = sumf; #endif } @@ -12707,7 +12513,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK4_0; for (int j = 0; j < n; j += k) { - block_q4_0 * restrict y = (block_q4_0 *)dst + j/QK4_0; + block_q4_0 * restrict y = (block_q4_0 *) dst + j/QK4_0; quantize_row_q4_0_reference(src + j, y, k); @@ -12730,7 +12536,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK4_1; for (int j = 0; j < n; j += k) { - block_q4_1 * restrict y = (block_q4_1 *)dst + j/QK4_1; + block_q4_1 * restrict y = (block_q4_1 *) dst + j/QK4_1; quantize_row_q4_1_reference(src + j, y, k); From fd2a137fac3a2ca48d7ef16fea2bf12c1f401397 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 May 2023 21:51:42 +0300 Subject: [PATCH 03/32] ggml : nibbles_from_floats() + bytes_from_nibbles() (ARM NEON) --- ggml.c | 131 ++++++++++++++++++++++++++------------------------------- 1 file changed, 60 insertions(+), 71 deletions(-) diff --git a/ggml.c b/ggml.c index cb8cea045567b..f165129eaad45 100644 --- a/ggml.c +++ b/ggml.c @@ -615,6 +615,50 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) #if __ARM_NEON +static inline const uint8_t * bytes_from_nibbles_64(const int qk, const uint8_t * qs, uint64_t * qd) { + memcpy(qd, qs, qk/2); + + for (int l = 0; l < qk/16; ++l) { + qd[l + qk/16] = (qd[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; + qd[l + 0 ] = (qd[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; + } + + return (const uint8_t *) qd; +} + +// pack first half of weights 
into low nibbles and second half into high nibbles +// use one scaling factor +static inline void nibbles_from_floats_64_0(const int qk, const float * x, float id, uint8_t * qs, uint64_t * qd) { + for (int l = 0; l < qk/2; ++l) { + const float v0 = x[0 + l]*id; + const float v1 = x[qk/2 + l]*id; + + const uint64_t vi0 = MIN(15, (int8_t)(v0 + 8.5f)); + const uint64_t vi1 = MIN(15, (int8_t)(v1 + 8.5f)); + + qd[l/8] |= vi0 << (8*(l & 7)); + qd[l/8] |= vi1 << (8*(l & 7) + 4); + } + + memcpy(qs, qd, qk/2); +} + +// use offset and scaling factor +static inline void nibbles_from_floats_64_1(const int qk, const float * x, float id, float min, uint8_t * qs, uint64_t * qd) { + for (int l = 0; l < qk/2; ++l) { + const float v0 = (x[0 + l] - min)*id; + const float v1 = (x[qk/2 + l] - min)*id; + + const uint64_t vi0 = MIN(15, (int8_t)(v0 + 0.5f)); + const uint64_t vi1 = MIN(15, (int8_t)(v1 + 0.5f)); + + qd[l/8] |= vi0 << (8*(l & 7)); + qd[l/8] |= vi1 << (8*(l & 7) + 4); + } + + memcpy(qs, qd, qk/2); +} + #if !defined(__aarch64__) inline static uint16_t vaddvq_u8(uint8x16_t v) { @@ -863,19 +907,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r uint64_t qs[QK4_0 / 16] = {0}; - // pack first half of weights into low nibbles and second half into high nibbles - for (int l = 0; l < qk/2; ++l) { - const float v0 = x[i*qk + 0 + l]*id; - const float v1 = x[i*qk + qk/2 + l]*id; - - const uint64_t vi0 = MIN(15, (int8_t)(v0 + 8.5f)); - const uint64_t vi1 = MIN(15, (int8_t)(v1 + 8.5f)); - - qs[l/8] |= vi0 << (8*(l & 7)); - qs[l/8] |= vi1 << (8*(l & 7) + 4); - } - - memcpy(y[i].qs, qs, sizeof(qs)); + nibbles_from_floats_64_0(qk, x + i*qk, id, y[i].qs, qs); } } @@ -910,19 +942,7 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r uint64_t qs[QK4_1 / 16] = {0}; - // pack first half of weights into low nibbles and second half into high nibbles - for (int l = 0; l < qk/2; ++l) { - const float v0 = (x[i*qk + 0 + l] - min)*id; - const float v1 = (x[i*qk + qk/2 + l] - min)*id; - - const uint64_t vi0 = MIN(15, (int8_t)(v0 + 0.5f)); - const uint64_t vi1 = MIN(15, (int8_t)(v1 + 0.5f)); - - qs[l/8] |= vi0 << (8*(l & 7)); - qs[l/8] |= vi1 << (8*(l & 7) + 4); - } - - memcpy(y[i].qs, qs, sizeof(qs)); + nibbles_from_floats_64_1(qk, x + i*qk, id, min, y[i].qs, qs); } } @@ -1435,20 +1455,12 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict const int nb = k / qk; + uint64_t qs[QK4_0 / 8]; + for (int i = 0; i < nb; i++) { const float d = x[i].d; - // unpack nibbles into bytes - uint64_t qs[QK4_0 / 8] = {0}; - - memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); - - for (int l = 0; l < qk / 16; ++l) { - qs[l + qk/16] = (qs[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; - qs[l + 0 ] = (qs[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; - } - - const uint8_t * restrict qsp = (const uint8_t * restrict) qs; + const uint8_t * qsp = bytes_from_nibbles_64(qk, x[i].qs, qs); for (int l = 0; l < qk; ++l) { y[i*qk + l] = (qsp[l] - 8)*d; @@ -1464,21 +1476,13 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict const int nb = k / qk; + uint64_t qs[QK4_0 / 8]; + for (int i = 0; i < nb; i++) { const float d = x[i].d; const float m = x[i].m; - // unpack nibbles into bytes - uint64_t qs[QK4_0 / 8] = {0}; - - memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); - - for (int l = 0; l < qk / 16; ++l) { - qs[l + qk/16] = (qs[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; - qs[l + 0 ] = (qs[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; - } - - const uint8_t * restrict qsp = (const uint8_t * 
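/*
 * bytes_from_nibbles_64() above relies on the half-split layout: after
 * copying the qk/2 packed bytes into 64-bit words, one mask-and-shift
 * per word moves the high nibbles into the upper half of the scratch
 * buffer and masks the low nibbles in place. For qk = 32:
 *
 *   qd[2..3] = (qd[0..1] & 0xF0F0F0F0F0F0F0F0) >> 4;  // weights 16..31
 *   qd[0..1] =  qd[0..1] & 0x0F0F0F0F0F0F0F0F;        // weights  0..15
 *
 * which leaves all 32 quants as bytes in logical order, handling 8
 * weights per 64-bit operation instead of 2 per byte-wise unpack.
 */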
restrict) qs; + const uint8_t * qsp = bytes_from_nibbles_64(qk, x[i].qs, qs); for (int l = 0; l < qk; ++l) { y[i*qk + l] = qsp[l]*d + m; @@ -2410,19 +2414,12 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * // scalar float sumf = 0.0; + uint64_t qs[QK8_0 / 8]; + for (int i = 0; i < nb; i++) { // unpack nibbles into bytes - uint64_t qs[QK8_0 / 8] = {0}; - - memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); - - for (int l = 0; l < qk / 16; ++l) { - qs[l + qk/16] = (qs[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; - qs[l + 0 ] = (qs[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; - } - - const uint8_t * restrict px = (const uint8_t * restrict) qs; - const int8_t * restrict py = y[i].qs; + const uint8_t * px = bytes_from_nibbles_64(qk, x[i].qs, qs); + const int8_t * py = y[i].qs; int sumi = 0; @@ -2542,19 +2539,11 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * // scalar float sumf = 0.0; - for (int i = 0; i < nb; i++) { - // unpack nibbles into bytes - uint64_t qs[QK8_1 / 8] = {0}; - - memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); + uint64_t qs[QK8_1 / 8]; - for (int l = 0; l < qk / 16; ++l) { - qs[l + qk/16] = (qs[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; - qs[l + 0 ] = (qs[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; - } - - const uint8_t * restrict px = (const uint8_t * restrict) qs; - const int8_t * restrict py = y[i].qs; + for (int i = 0; i < nb; i++) { + const uint8_t * px = bytes_from_nibbles_64(qk, x[i].qs, qs); + const int8_t * py = y[i].qs; int sumi = 0; From 9f3285f74153ad50c6eef1b47a5650888ebd9e27 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 May 2023 22:07:40 +0300 Subject: [PATCH 04/32] ggml : remove Q4_2 bit shuffling (WIP, BROKEN) --- ggml.c | 117 +++++++++++++++++++++++---------------------------------- 1 file changed, 47 insertions(+), 70 deletions(-) diff --git a/ggml.c b/ggml.c index f165129eaad45..7a9a500cd987c 100644 --- a/ggml.c +++ b/ggml.c @@ -884,7 +884,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r static const int qk = QK4_0; assert(qk / 16 == 0); - assert(k % qk == 0); + assert( k % qk == 0); const int nb = k / qk; @@ -919,7 +919,7 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r const int qk = QK4_1; assert(qk / 16 == 0); - assert(k % qk == 0); + assert( k % qk == 0); const int nb = k / qk; @@ -952,48 +952,37 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k // reference implementation for deterministic creation of model files static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * restrict y, int k) { - assert(k % QK4_2 == 0); + static const int qk = QK4_2; - const int nb = k / QK4_2; + assert(qk / 16 == 0); + assert( k % qk == 0); + + const int nb = k / qk; for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max - float max = 0.0f; + float max = 0.0f; - for (int l = 0; l < QK4_2; l++) { - const float v = x[i*QK4_2 + l]; + for (int l = 0; l < qk; l++) { + const float v = x[i*qk + l]; if (amax < fabsf(v)) { amax = fabsf(v); - max = v; + max = v; } } - const float d = max / -8; - + const float d = max / -8; const float id = d ? 
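/*
 * q4_2 uses the same signed scheme as q4_0 (scale = max / -8, nibbles
 * in [0, 15]) but on 16-element blocks, with the scale stored as fp16
 * via GGML_FP32_TO_FP16 to keep the block compact.
 */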
1.0f/d : 0.0f; y[i].d = GGML_FP32_TO_FP16(d); - for (int l = 0; l < QK4_2; l += 2) { - const float v0 = x[i*QK4_2 + l + 0]*id; - const float v1 = x[i*QK4_2 + l + 1]*id; - - const uint8_t vi0 = MIN(15, (uint8_t)(v0 + 8.5f)); - const uint8_t vi1 = MIN(15, (uint8_t)(v1 + 8.5f)); + uint64_t qs[QK4_2 / 16] = {0}; - assert(vi0 < 16); - assert(vi1 < 16); - - y[i].qs[l/2] = vi0 | (vi1 << 4); - } + nibbles_from_floats_64_0(qk, x + i*qk, id, y[i].qs, qs); } } -static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int k) { - assert(k % QK4_2 == 0); - - block_q4_2 * restrict y = vy; - +static void quantize_row_q4_2(const float * restrict x, void * restrict y, int k) { quantize_row_q4_2_reference(x, y, k); } @@ -1451,7 +1440,7 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict static const int qk = QK4_0; assert(qk / 16 == 0); - assert(k % qk == 0); + assert( k % qk == 0); const int nb = k / qk; @@ -1472,7 +1461,7 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict static const int qk = QK4_1; assert(qk / 16 == 0); - assert(k % qk == 0); + assert( k % qk == 0); const int nb = k / qk; @@ -1490,31 +1479,23 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict } } -static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, int k) { - assert(k % QK4_2 == 0); - const int nb = k / QK4_2; - - const block_q4_2 * restrict x = vx; - - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); +static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict y, int k) { + static const int qk = QK4_2; - const uint8_t * restrict pp = x[i].qs; + assert(qk / 16 == 0); + assert( k % qk == 0); - for (int l = 0; l < QK4_2; l += 2) { - const uint8_t vi = pp[l/2]; + const int nb = k / qk; - const int8_t vi0 = vi & 0x0F; - const int8_t vi1 = vi >> 4; + uint64_t qs[QK4_2 / 8]; - const float v0 = (vi0 - 8)*d; - const float v1 = (vi1 - 8)*d; + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); - y[i*QK4_2 + l + 0] = v0; - y[i*QK4_2 + l + 1] = v1; + const uint8_t * qsp = bytes_from_nibbles_64(qk, x[i].qs, qs); - assert(!isnan(y[i*QK4_2 + l + 0])); - assert(!isnan(y[i*QK4_2 + l + 1])); + for (int l = 0; l < qk; ++l) { + y[i*qk + l] = (qsp[l] - 8)*d; } } } @@ -1634,7 +1615,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_1, }, [GGML_TYPE_Q4_2] = { - .dequantize_row_q = dequantize_row_q4_2, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_2, .quantize_row_q = quantize_row_q4_2, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_reference, .quantize_row_q_dot = quantize_row_q8_0, @@ -2559,11 +2540,13 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * } static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK8_0; + const int qk = QK8_0; + const int nb = n / qk; - assert(n % QK8_0 == 0); + assert(n % qk == 0); assert(nb % 2 == 0); - assert(QK8_0 == 2*QK4_2); + + assert(qk == 2*QK4_2); const block_q4_2 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -2599,12 +2582,6 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); - // interleave - const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs); - const int8x16_t v0_0hz = 
vzip2q_s8(v0_0ls, v0_0hs); - const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs); - const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs); - // load y const int8x16_t v1_0l = vld1q_s8(y0->qs); const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); @@ -2613,22 +2590,22 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * #if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), GGML_FP16_TO_FP32(x0_0->d)), - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), GGML_FP16_TO_FP32(x0_1->d))), y0->d); + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l)), GGML_FP16_TO_FP32(x0_0->d)), + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hs, v1_0h)), GGML_FP16_TO_FP32(x0_1->d))), y0->d); sumv1 = vmlaq_n_f32(sumv1, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l)), GGML_FP16_TO_FP32(x1_0->d)), - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1hz, v1_1h)), GGML_FP16_TO_FP32(x1_1->d))), y1->d); + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l)), GGML_FP16_TO_FP32(x1_0->d)), + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1hs, v1_1h)), GGML_FP16_TO_FP32(x1_1->d))), y1->d); #else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h)); + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h)); const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); From aa78dfed7df3f33642e5c9a459c8be097d43aa9f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 May 2023 22:55:10 +0300 Subject: [PATCH 05/32] ggml : remove Q5_0 bit shuffling (ARM NEON) --- ggml.c | 198 ++++++++++++++++++++++++++------------------------------- 1 file changed, 91 insertions(+), 107 deletions(-) diff --git a/ggml.c b/ggml.c index 7a9a500cd987c..2a25afbbd1275 100644 --- a/ggml.c +++ b/ggml.c @@ -626,39 +626,6 @@ static inline const uint8_t * bytes_from_nibbles_64(const int qk, const uint8_t return (const uint8_t *) qd; } -// pack first half of weights into low nibbles and second half into high nibbles -// use one scaling factor -static inline void nibbles_from_floats_64_0(const int qk, const float * x, float id, uint8_t * qs, uint64_t * qd) { - for (int l = 0; l < qk/2; ++l) { - const float v0 = x[0 + l]*id; - const float v1 = 
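/*
 * The shared packing helpers from the previous commit are removed here
 * and inlined back into each quantizer: the q5_0 path below has to
 * collect a fifth bit per weight inside the same packing loop, which
 * the 4-bit-only helpers could not express.
 */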
x[qk/2 + l]*id; - - const uint64_t vi0 = MIN(15, (int8_t)(v0 + 8.5f)); - const uint64_t vi1 = MIN(15, (int8_t)(v1 + 8.5f)); - - qd[l/8] |= vi0 << (8*(l & 7)); - qd[l/8] |= vi1 << (8*(l & 7) + 4); - } - - memcpy(qs, qd, qk/2); -} - -// use offset and scaling factor -static inline void nibbles_from_floats_64_1(const int qk, const float * x, float id, float min, uint8_t * qs, uint64_t * qd) { - for (int l = 0; l < qk/2; ++l) { - const float v0 = (x[0 + l] - min)*id; - const float v1 = (x[qk/2 + l] - min)*id; - - const uint64_t vi0 = MIN(15, (int8_t)(v0 + 0.5f)); - const uint64_t vi1 = MIN(15, (int8_t)(v1 + 0.5f)); - - qd[l/8] |= vi0 << (8*(l & 7)); - qd[l/8] |= vi1 << (8*(l & 7) + 4); - } - - memcpy(qs, qd, qk/2); -} - #if !defined(__aarch64__) inline static uint16_t vaddvq_u8(uint8x16_t v) { @@ -907,7 +874,18 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r uint64_t qs[QK4_0 / 16] = {0}; - nibbles_from_floats_64_0(qk, x + i*qk, id, y[i].qs, qs); + for (int l = 0; l < qk/2; ++l) { + const float v0 = x[i*qk + 0 + l]*id; + const float v1 = x[i*qk + qk/2 + l]*id; + + const uint64_t vi0 = MIN(15, (int8_t)(v0 + 8.5f)); + const uint64_t vi1 = MIN(15, (int8_t)(v1 + 8.5f)); + + qs[l/8] |= vi0 << (8*(l & 7)); + qs[l/8] |= vi1 << (8*(l & 7) + 4); + } + + memcpy(y[i].qs, qs, qk/2); } } @@ -942,7 +920,18 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r uint64_t qs[QK4_1 / 16] = {0}; - nibbles_from_floats_64_1(qk, x + i*qk, id, min, y[i].qs, qs); + for (int l = 0; l < qk/2; ++l) { + const float v0 = (x[0 + l] - min)*id; + const float v1 = (x[qk/2 + l] - min)*id; + + const uint64_t vi0 = MIN(15, (int8_t)(v0 + 0.5f)); + const uint64_t vi1 = MIN(15, (int8_t)(v1 + 0.5f)); + + qs[l/8] |= vi0 << (8*(l & 7)); + qs[l/8] |= vi1 << (8*(l & 7) + 4); + } + + memcpy(y[i].qs, qs, qk/2); } } @@ -978,7 +967,18 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r uint64_t qs[QK4_2 / 16] = {0}; - nibbles_from_floats_64_0(qk, x + i*qk, id, y[i].qs, qs); + for (int l = 0; l < qk/2; ++l) { + const float v0 = x[i*qk + 0 + l]*id; + const float v1 = x[i*qk + qk/2 + l]*id; + + const uint64_t vi0 = MIN(15, (int8_t)(v0 + 8.5f)); + const uint64_t vi1 = MIN(15, (int8_t)(v1 + 8.5f)); + + qs[l/8] |= vi0 << (8*(l & 7)); + qs[l/8] |= vi1 << (8*(l & 7) + 4); + } + + memcpy(y[i].qs, qs, qk/2); } } @@ -987,51 +987,54 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict y, int k } static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { - assert(k % QK5_0 == 0); - const int nb = k / QK5_0; + static const int qk = QK5_0; + + assert(qk / 16 == 0); + assert( k % qk == 0); + + const int nb = k / qk; for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max - float max = 0.0f; + float max = 0.0f; - for (int l = 0; l < QK5_0; l++) { - const float v = x[i*QK5_0 + l]; + for (int l = 0; l < qk; l++) { + const float v = x[i*qk + l]; if (amax < fabsf(v)) { amax = fabsf(v); - max = v; + max = v; } } - const float d = max / -16; + const float d = max / -16; const float id = d ? 
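/*
 * q5_0 extends the scheme to 5 bits: the scale is max / -16 and values
 * are rounded with +16.5f into [0, 31]. The low four bits of each quant
 * go into the qs nibbles exactly as in q4_0, while the fifth bits are
 * gathered into the 32-bit qh word, bit l for weight l of the first
 * half and bit l + qk/2 for weight l of the second half, matching the
 * non-interleaved nibble layout.
 */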
1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = d; uint32_t qh = 0; + uint64_t qs[QK5_0 / 16] = {0}; - for (int l = 0; l < QK5_0; l += 2) { - const float v0 = x[i*QK5_0 + l + 0]*id; - const float v1 = x[i*QK5_0 + l + 1]*id; + for (int l = 0; l < qk/2; ++l) { + const float v0 = x[i*qk + 0 + l]*id; + const float v1 = x[i*qk + qk/2 + l]*id; - const uint32_t vi0 = MIN(31, (int) (v0 + 16.5f)); - const uint32_t vi1 = MIN(31, (int) (v1 + 16.5f)); + const uint64_t vi0 = MIN(31, (int8_t)(v0 + 16.5f)); + const uint64_t vi1 = MIN(31, (int8_t)(v1 + 16.5f)); - y[i].qs[l/2] = (vi0 & 0x0F) | ((vi1 & 0x0F) << 4); + qs[l/8] |= vi0 << (8*(l & 7)); + qs[l/8] |= vi1 << (8*(l & 7) + 4); // get the 5-th bit and store it in qh at the right position qh |= ((vi0 & 0x10) >> 4) << (l + 0); - qh |= ((vi1 & 0x10) >> 4) << (l + 1); + qh |= ((vi1 & 0x10) >> 4) << (l + qk/2); } - memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); + memcpy( y[i].qs, qs, qk/2); + memcpy(&y[i].qh, &qh, sizeof(qh)); } } -static void quantize_row_q5_0(const float * restrict x, void * restrict vy, int k) { - assert(k % QK5_0 == 0); - - block_q5_0 * restrict y = vy; - +static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { quantize_row_q5_0_reference(x, y, k); } @@ -1500,38 +1503,28 @@ static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict } } -static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, int k) { - assert(k % QK5_0 == 0); - const int nb = k / QK5_0; +static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { + static const int qk = QK4_0; - const block_q5_0 * restrict x = vx; + assert(qk / 16 == 0); + assert( k % qk == 0); + + const int nb = k / qk; + + uint64_t qs[QK5_0 / 8]; for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict pp = x[i].qs; - uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); - for (int l = 0; l < QK5_0; l += 2) { - const uint8_t vi = pp[l/2]; - - // extract the 5-th bit from qh - const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; - - const int8_t vi0 = (vi & 0x0F) | vh0; - const int8_t vi1 = (vi >> 4) | vh1; - - const float v0 = (vi0 - 16)*d; - const float v1 = (vi1 - 16)*d; + const uint8_t * qsp = bytes_from_nibbles_64(qk, x[i].qs, qs); - y[i*QK5_0 + l + 0] = v0; - y[i*QK5_0 + l + 1] = v1; + for (int l = 0; l < qk; ++l) { + const uint8_t vh = ((qh & (1u << l)) >> l) << 4; - assert(!isnan(y[i*QK5_0 + l + 0])); - assert(!isnan(y[i*QK5_0 + l + 1])); + y[i*qk + l] = ((qsp[l] | vh) - 16)*d; } } } @@ -1623,7 +1616,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q5_0] = { - .dequantize_row_q = dequantize_row_q5_0, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_0, .quantize_row_q = quantize_row_q5_0, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_0_reference, .quantize_row_q_dot = quantize_row_q8_0, @@ -2693,11 +2686,12 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * } static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK8_0; + const int qk = QK8_0; + const int nb = n / qk; - assert(n % QK8_0 == 0); + assert(n % qk == 0); assert(nb % 2 == 0); - assert(QK8_0 == QK5_0); + assert(qk == QK5_0); const block_q5_0 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -2732,13 
+2726,9 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const int8x16_t v0l = vreinterpretq_s8_u8(vandq_u8 (v0, m4b)); const int8x16_t v0h = vreinterpretq_s8_u8(vshrq_n_u8(v0, 4)); - // interleave - const int8x16_t v0lz = vzip1q_s8(v0l, v0h); - const int8x16_t v0hz = vzip2q_s8(v0l, v0h); - // add high bit and sub 16 - const int8x16_t v0lf = vsubq_s8(vorrq_s8(v0lz, qhl), s16b); - const int8x16_t v0hf = vsubq_s8(vorrq_s8(v0hz, qhh), s16b); + const int8x16_t v0lf = vsubq_s8(vorrq_s8(v0l, qhl), s16b); + const int8x16_t v0hf = vsubq_s8(vorrq_s8(v0h, qhh), s16b); // load y const int8x16_t v1l = vld1q_s8(y0->qs); @@ -2856,34 +2846,28 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * #else // scalar float sumf = 0.0; + + uint64_t qs[QK8_0 / 8]; + for (int i = 0; i < nb; i++) { - const uint8_t * restrict x0 = x[i].qs; - const int8_t * restrict y0 = y[i].qs; + // unpack nibbles into bytes + const uint8_t * px = bytes_from_nibbles_64(qk, x[i].qs, qs); + const int8_t * py = y[i].qs; uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); - const float d = GGML_FP16_TO_FP32(x[i].d); - - int sxy = 0; - - for (int j = 0; j < QK8_0/2; j++) { - const uint8_t v0 = x0[j]; - - const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4; - const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4; - - const int x0_0 = ((v0 & 0x0F) | x0_0h) - 16; - const int x1_0 = ((v0 >> 4) | x1_0h) - 16; + int sumi = 0; - const int y0_0 = y0[2*j + 0]; - const int y1_0 = y0[2*j + 1]; + for (int j = 0; j < qk; ++j) { + const int xh = ((qh & (1u << j)) >> j) << 4; - sxy += x0_0*y0_0 + x1_0*y1_0; + sumi += ((px[j] | xh) - 16)*py[j]; } - sumf += (d*sxy)*y[i].d; + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi; } + *s = sumf; #endif } From b37a08f6468c454110a3e844fe0978672c213df9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 May 2023 23:31:35 +0300 Subject: [PATCH 06/32] ggml : 2x faster scalar implementations --- ggml.c | 132 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 67 insertions(+), 65 deletions(-) diff --git a/ggml.c b/ggml.c index 2a25afbbd1275..98e7a1fdd6894 100644 --- a/ggml.c +++ b/ggml.c @@ -615,7 +615,8 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) #if __ARM_NEON -static inline const uint8_t * bytes_from_nibbles_64(const int qk, const uint8_t * qs, uint64_t * qd) { +// TODO: obsolete - will be removed +static inline const uint8_t * b4_from_nibbles_64(const int qk, const uint8_t * qs, uint64_t * qd) { memcpy(qd, qs, qk/2); for (int l = 0; l < qk/16; ++l) { @@ -875,14 +876,14 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r uint64_t qs[QK4_0 / 16] = {0}; for (int l = 0; l < qk/2; ++l) { - const float v0 = x[i*qk + 0 + l]*id; - const float v1 = x[i*qk + qk/2 + l]*id; + const float x0 = x[i*qk + 0 + l]*id; + const float x1 = x[i*qk + qk/2 + l]*id; - const uint64_t vi0 = MIN(15, (int8_t)(v0 + 8.5f)); - const uint64_t vi1 = MIN(15, (int8_t)(v1 + 8.5f)); + const uint64_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint64_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); - qs[l/8] |= vi0 << (8*(l & 7)); - qs[l/8] |= vi1 << (8*(l & 7) + 4); + qs[l/8] |= xi0 << (8*(l & 7)); + qs[l/8] |= xi1 << (8*(l & 7) + 4); } memcpy(y[i].qs, qs, qk/2); @@ -921,14 +922,14 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r uint64_t qs[QK4_1 / 16] = {0}; for (int l = 0; l < qk/2; ++l) { - const float v0 = (x[0 + l] - min)*id; - const float v1 = (x[qk/2
+ l] - min)*id; + const float x0 = (x[0 + l] - min)*id; + const float x1 = (x[qk/2 + l] - min)*id; - const uint64_t vi0 = MIN(15, (int8_t)(v0 + 0.5f)); - const uint64_t vi1 = MIN(15, (int8_t)(v1 + 0.5f)); + const uint64_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint64_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); - qs[l/8] |= vi0 << (8*(l & 7)); - qs[l/8] |= vi1 << (8*(l & 7) + 4); + qs[l/8] |= xi0 << (8*(l & 7)); + qs[l/8] |= xi1 << (8*(l & 7) + 4); } memcpy(y[i].qs, qs, qk/2); @@ -968,14 +969,14 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r uint64_t qs[QK4_2 / 16] = {0}; for (int l = 0; l < qk/2; ++l) { - const float v0 = x[i*qk + 0 + l]*id; - const float v1 = x[i*qk + qk/2 + l]*id; + const float x0 = x[i*qk + 0 + l]*id; + const float x1 = x[i*qk + qk/2 + l]*id; - const uint64_t vi0 = MIN(15, (int8_t)(v0 + 8.5f)); - const uint64_t vi1 = MIN(15, (int8_t)(v1 + 8.5f)); + const uint64_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint64_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); - qs[l/8] |= vi0 << (8*(l & 7)); - qs[l/8] |= vi1 << (8*(l & 7) + 4); + qs[l/8] |= xi0 << (8*(l & 7)); + qs[l/8] |= xi1 << (8*(l & 7) + 4); } memcpy(y[i].qs, qs, qk/2); @@ -1015,18 +1016,18 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r uint64_t qs[QK5_0 / 16] = {0}; for (int l = 0; l < qk/2; ++l) { - const float v0 = x[i*qk + 0 + l]*id; - const float v1 = x[i*qk + qk/2 + l]*id; + const float x0 = x[i*qk + 0 + l]*id; + const float x1 = x[i*qk + qk/2 + l]*id; - const uint64_t vi0 = MIN(31, (int8_t)(v0 + 16.5f)); - const uint64_t vi1 = MIN(31, (int8_t)(v1 + 16.5f)); + const uint64_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); + const uint64_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); - qs[l/8] |= vi0 << (8*(l & 7)); - qs[l/8] |= vi1 << (8*(l & 7) + 4); + qs[l/8] |= xi0 << (8*(l & 7)); + qs[l/8] |= xi1 << (8*(l & 7) + 4); // get the 5-th bit and store it in qh at the right position - qh |= ((vi0 & 0x10) >> 4) << (l + 0); - qh |= ((vi1 & 0x10) >> 4) << (l + qk/2); + qh |= ((xi0 & 0x10) >> 4) << (l + 0); + qh |= ((xi1 & 0x10) >> 4) << (l + qk/2); } memcpy( y[i].qs, qs, qk/2); @@ -1447,15 +1448,15 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict const int nb = k / qk; - uint64_t qs[QK4_0 / 8]; - for (int i = 0; i < nb; i++) { const float d = x[i].d; - const uint8_t * qsp = bytes_from_nibbles_64(qk, x[i].qs, qs); + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0xf) - 8; + const int x1 = (x[i].qs[j] >> 4) - 8; - for (int l = 0; l < qk; ++l) { - y[i*qk + l] = (qsp[l] - 8)*d; + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; } } } @@ -1468,21 +1469,22 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict const int nb = k / qk; - uint64_t qs[QK4_0 / 8]; - for (int i = 0; i < nb; i++) { const float d = x[i].d; const float m = x[i].m; - const uint8_t * qsp = bytes_from_nibbles_64(qk, x[i].qs, qs); + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0xf); + const int x1 = (x[i].qs[j] >> 4); - for (int l = 0; l < qk; ++l) { - y[i*qk + l] = qsp[l]*d + m; + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; } } } static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict y, int k) { + // BROKEN !!!
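// Aside: a minimal standalone sketch of the nibble layout these commits
// converge on — not part of the patch. Element j of a block goes into the low
// nibble of byte j and element j + qk/2 into the high nibble, so unpacking is
// a mask and a shift with no vzip interleave. Names are illustrative, not ggml
// API; qk = 32 is assumed, as for QK4_0.
#include <assert.h>
#include <stdint.h>

static void pack_nibbles(const uint8_t v[32], uint8_t qs[16]) {
    for (int j = 0; j < 16; ++j) {
        qs[j] = (v[j] & 0x0F) | ((v[16 + j] & 0x0F) << 4); // low = first half, high = second half
    }
}

static void unpack_nibbles(const uint8_t qs[16], uint8_t v[32]) {
    for (int j = 0; j < 16; ++j) {
        v[j]      = qs[j] & 0x0F; // first half from the low nibbles
        v[16 + j] = qs[j] >> 4;   // second half from the high nibbles
    }
}

int main(void) {
    uint8_t v[32], qs[16], w[32];
    for (int j = 0; j < 32; ++j) v[j] = (uint8_t)(j & 0x0F);
    pack_nibbles(v, qs);
    unpack_nibbles(qs, w);
    for (int j = 0; j < 32; ++j) assert(v[j] == w[j]); // lossless round trip
    return 0;
}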
static const int qk = QK4_2; assert(qk / 16 == 0); @@ -1495,7 +1497,7 @@ static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * qsp = bytes_from_nibbles_64(qk, x[i].qs, qs); + const uint8_t * qsp = b4_from_nibbles_64(qk, x[i].qs, qs); for (int l = 0; l < qk; ++l) { y[i*qk + l] = (qsp[l] - 8)*d; @@ -1511,20 +1513,21 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict const int nb = k / qk; - uint64_t qs[QK5_0 / 8]; - for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); - const uint8_t * qsp = bytes_from_nibbles_64(qk, x[i].qs, qs); + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - for (int l = 0; l < qk; ++l) { - const uint8_t vh = ((qh & (1u << l)) >> l) << 4; + const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - y[i*qk + l] = ((qsp[l] | vh) - 16)*d; + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; } } } @@ -2388,17 +2391,16 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * // scalar float sumf = 0.0; - uint64_t qs[QK8_0 / 8]; - for (int i = 0; i < nb; i++) { - // unpack nibbles into bytes - const uint8_t * px = bytes_from_nibbles_64(qk, x[i].qs, qs); - const int8_t * py = y[i].qs; + const int8_t * py = y[i].qs; int sumi = 0; - for (int j = 0; j < qk; ++j) { - sumi += (px[j] - 8) * py[j]; + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0xf) - 8; + const int v1 = (x[i].qs[j] >> 4) - 8; + + sumi += (v0 * py[j]) + (v1 * py[j + qk/2]); } sumf += (x[i].d*y[i].d)*sumi; @@ -2513,16 +2515,16 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * // scalar float sumf = 0.0; - uint64_t qs[QK8_1 / 8]; - for (int i = 0; i < nb; i++) { - const uint8_t * px = bytes_from_nibbles_64(qk, x[i].qs, qs); - const int8_t * py = y[i].qs; + const int8_t * py = y[i].qs; int sumi = 0; - for (int j = 0; j < qk; ++j) { - sumi += px[j]*py[j]; + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0xf); + const int v1 = (x[i].qs[j] >> 4); + + sumi += (v0 * py[j]) + (v1 * py[j + qk/2]); } sumf += (x[i].d*y[i].d)*sumi + x[i].m*(y[i].s0 + y[i].s1); @@ -2847,22 +2849,22 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * // scalar float sumf = 0.0; - uint64_t qs[QK8_0 / 8]; - for (int i = 0; i < nb; i++) { - // unpack nibbles into bytes - const uint8_t * px = bytes_from_nibbles_64(qk, x[i].qs, qs); - const int8_t * py = y[i].qs; + const int8_t * py = y[i].qs; uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); int sumi = 0; - for (int j = 0; j < qk; ++j) { - const int xh = ((qh & (1u << j)) >> j) << 4; + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - sumi += ((px[j] | xh) - 16)*py[j]; + sumi += (x0 * py[j]) + (x1 * py[j + qk/2]); } sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi; From 292a778ca2246c385047c752fe1e0fa5c9c564e3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 5 May 2023 17:09:11 +0300 Subject: [PATCH 07/32] ggml : remove Q5_1 bit shuffling (ARM NEON + scalar) --- ggml.c | 
172 ++++++++++++++++++++++----------------------------------- 1 file changed, 66 insertions(+), 106 deletions(-) diff --git a/ggml.c b/ggml.c index 98e7a1fdd6894..cb321455b4dd2 100644 --- a/ggml.c +++ b/ggml.c @@ -851,8 +851,7 @@ static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block s static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { static const int qk = QK4_0; - assert(qk / 16 == 0); - assert( k % qk == 0); + assert(k % qk == 0); const int nb = k / qk; @@ -873,20 +872,16 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r y[i].d = d; - uint64_t qs[QK4_0 / 16] = {0}; - for (int l = 0; l < qk/2; ++l) { const float x0 = x[i*qk + 0 + l]*id; const float x1 = x[i*qk + qk/2 + l]*id; - const uint64_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); - const uint64_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); - qs[l/8] |= xi0 << (8*(l & 7)); - qs[l/8] |= xi1 << (8*(l & 7) + 4); + y[i].qs[l] = xi0; + y[i].qs[l] |= xi1 << 4; } - - memcpy(y[i].qs, qs, qk/2); } } @@ -897,8 +892,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict y, int k static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { const int qk = QK4_1; - assert(qk / 16 == 0); - assert( k % qk == 0); + assert(k % qk == 0); const int nb = k / qk; @@ -919,20 +913,16 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r y[i].d = d; y[i].m = min; - uint64_t qs[QK4_1 / 16] = {0}; - for (int l = 0; l < qk/2; ++l) { const float x0 = (x[0 + l] - min)*id; const float x1 = (x[qk/2 + l] - min)*id; - const uint64_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); - const uint64_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); - qs[l/8] |= xi0 << (8*(l & 7)); - qs[l/8] |= xi1 << (8*(l & 7) + 4); + y[i].qs[l] = xi0; + y[i].qs[l] |= xi1 << 4; } - - memcpy(y[i].qs, qs, qk/2); } } @@ -944,8 +934,7 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * restrict y, int k) { static const int qk = QK4_2; - assert(qk / 16 == 0); - assert( k % qk == 0); + assert(k % qk == 0); const int nb = k / qk; @@ -990,8 +979,7 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict y, int k static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { static const int qk = QK5_0; - assert(qk / 16 == 0); - assert( k % qk == 0); + assert(k % qk == 0); const int nb = k / qk; @@ -1013,24 +1001,21 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r y[i].d = d; uint32_t qh = 0; - uint64_t qs[QK5_0 / 16] = {0}; for (int l = 0; l < qk/2; ++l) { const float x0 = x[i*qk + 0 + l]*id; const float x1 = x[i*qk + qk/2 + l]*id; - const uint64_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); - const uint64_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); + const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); + const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); - qs[l/8] |= xi0 << (8*(l & 7)); - qs[l/8] |= xi1 << (8*(l & 7) + 4); + y[i].qs[l] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); // get the 5-th bit and store it in qh at the right position qh |= ((xi0 & 0x10) >> 4) << (l + 0); qh |= ((xi1 & 0x10) >> 4) << (l + qk/2); } - memcpy( y[i].qs, qs, qk/2); 
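// Aside, not part of the patch: with this ordering, Q5_0 keeps the 5-th bit of
// first-half element j at qh bit j, and of second-half element j at qh bit
// j + 16. The scalar paths later recover it with a shift that parks the bit at
// position 4, ready to OR onto the nibble. A minimal self-contained check,
// assuming qk = 32 as for QK5_0:
#include <assert.h>
#include <stdint.h>

int main(void) {
    const int j = 5;      // arbitrary element index in [0, qk/2)
    uint32_t qh = 0;

    qh |= 1u << (j + 0);  // 5-th bit of first-half element j
    qh |= 1u << (j + 16); // 5-th bit of second-half element j

    // extraction as in the scalar kernels: >> (j + 12) lands the bit at position 4
    const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
    const uint8_t xh_1 =  (qh & (1u << (j + 16))) >> (j + 12);

    assert(xh_0 == 0x10 && xh_1 == 0x10);
    return 0;
}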
memcpy(&y[i].qh, &qh, sizeof(qh)); } } @@ -1040,20 +1025,24 @@ static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k } static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { - assert(k % QK5_1 == 0); - const int nb = k / QK5_1; + const int qk = QK5_1; + + assert(k % qk == 0); + + const int nb = k / qk; for (int i = 0; i < nb; i++) { float min = FLT_MAX; float max = -FLT_MAX; - for (int l = 0; l < QK5_1; l++) { - const float v = x[i*QK5_1 + l]; + for (int l = 0; l < qk; l++) { + const float v = x[i*qk + l]; + if (v < min) min = v; if (v > max) max = v; } - const float d = (max - min) / ((1 << 5) - 1); + const float d = (max - min) / ((1 << 5) - 1); const float id = d ? 1.0f/d : 0.0f; y[i].d = GGML_FP32_TO_FP16(d); @@ -1061,29 +1050,25 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r uint32_t qh = 0; - for (int l = 0; l < QK5_1; l += 2) { - const float v0 = (x[i*QK5_1 + l + 0] - min)*id; - const float v1 = (x[i*QK5_1 + l + 1] - min)*id; + for (int l = 0; l < qk/2; ++l) { + const float x0 = (x[i*qk + 0 + l] - min)*id; + const float x1 = (x[i*qk + qk/2 + l] - min)*id; - const uint32_t vi0 = (int) (v0 + 0.5f); - const uint32_t vi1 = (int) (v1 + 0.5f); + const uint8_t xi0 = (uint8_t)(x0 + 0.5f); + const uint8_t xi1 = (uint8_t)(x1 + 0.5f); - y[i].qs[l/2] = (vi0 & 0x0F) | ((vi1 & 0x0F) << 4); + y[i].qs[l] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); // get the 5-th bit and store it in qh at the right position - qh |= ((vi0 & 0x10) >> 4) << (l + 0); - qh |= ((vi1 & 0x10) >> 4) << (l + 1); + qh |= ((xi0 & 0x10) >> 4) << (l + 0); + qh |= ((xi1 & 0x10) >> 4) << (l + qk/2); } memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); } } -static void quantize_row_q5_1(const float * restrict x, void * restrict vy, int k) { - assert(k % QK5_1 == 0); - - block_q5_1 * restrict y = vy; - +static void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { quantize_row_q5_1_reference(x, y, k); } @@ -1443,8 +1428,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { static const int qk = QK4_0; - assert(qk / 16 == 0); - assert( k % qk == 0); + assert(k % qk == 0); const int nb = k / qk; @@ -1464,8 +1448,7 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict static const int qk = QK4_1; - assert(qk / 16 == 0); - assert( k % qk == 0); + assert(k % qk == 0); const int nb = k / qk; @@ -1487,8 +1470,7 @@ static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict // BROKEN !!!
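// Aside, not part of the patch: the _1 formats carry an offset m, and the
// scalar dot products avoid dequantizing x by using the identity
//   sum_j (qx[j]*d + m)*(qy[j]*dy) = (d*dy)*sum_j qx[j]*qy[j] + m*(dy*sum_j qy[j]),
// where dy*sum(qy) is what q8_1 precomputes as s0 + s1. A toy numeric check
// with made-up values:
#include <assert.h>
#include <math.h>

int main(void) {
    const float d = 0.5f, m = -1.0f, dy = 0.25f;
    const int qx[4] = {1, 7, 3, 15};
    const int qy[4] = {-3, 8, 1, -7};

    float ref = 0.0f;
    int sumi = 0, sq = 0;

    for (int j = 0; j < 4; ++j) {
        ref  += (qx[j]*d + m)*(qy[j]*dy); // dot product of dequantized values
        sumi += qx[j]*qy[j];
        sq   += qy[j];
    }

    const float fast = (d*dy)*sumi + m*(dy*sq); // dy*sq plays the role of s0 + s1

    assert(fabsf(ref - fast) < 1e-5f);
    return 0;
}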
static const int qk = QK4_2; - assert(qk / 16 == 0); - assert( k % qk == 0); + assert(k % qk == 0); const int nb = k / qk; @@ -1508,8 +1490,7 @@ static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { static const int qk = QK4_0; - assert(qk / 16 == 0); - assert( k % qk == 0); + assert(k % qk == 0); const int nb = k / qk; @@ -1532,39 +1513,29 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict } } -static void dequantize_row_q5_1(const void * restrict vx, float * restrict y, int k) { - assert(k % QK5_1 == 0); - const int nb = k / QK5_1; +static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) { + static const int qk = QK5_1; - const block_q5_1 * restrict x = vx; + assert(k % qk == 0); + + const int nb = k / qk; for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); const float m = GGML_FP16_TO_FP32(x[i].m); - const uint8_t * restrict pp = x[i].qs; - uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); - for (int l = 0; l < QK5_1; l += 2) { - const uint8_t vi = pp[l/2]; - - // extract the 5-th bit from qh - const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; - - const uint8_t vi0 = (vi & 0x0F) | vh0; - const uint8_t vi1 = (vi >> 4) | vh1; - - const float v0 = vi0*d + m; - const float v1 = vi1*d + m; + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - y[i*QK5_1 + l + 0] = v0; - y[i*QK5_1 + l + 1] = v1; + const int x0 = (x[i].qs[j] & 0xf) | xh_0; + const int x1 = (x[i].qs[j] >> 4) | xh_1; - assert(!isnan(y[i*QK5_1 + l + 0])); - assert(!isnan(y[i*QK5_1 + l + 1])); + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; } } } @@ -1627,7 +1598,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q5_1] = { - .dequantize_row_q = dequantize_row_q5_1, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_1, .quantize_row_q = quantize_row_q5_1, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_1_reference, .quantize_row_q_dot = quantize_row_q8_1, @@ -2875,11 +2846,12 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * } static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK8_1; + const int qk = QK8_1; + const int nb = n / qk; - assert(n % QK8_1 == 0); + assert(n % qk == 0); assert(nb % 2 == 0); - assert(QK8_1 == QK5_1); + assert(qk == QK5_1); const block_q5_1 * restrict x = vx; const block_q8_1 * restrict y = vy; @@ -2915,13 +2887,9 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const int8x16_t v0l = vreinterpretq_s8_u8(vandq_u8 (v0, vdupq_n_u8(0x0F))); const int8x16_t v0h = vreinterpretq_s8_u8(vshrq_n_u8(v0, 4)); - // interleave - const int8x16_t v0lz = vzip1q_s8(v0l, v0h); - const int8x16_t v0hz = vzip2q_s8(v0l, v0h); - // add - const int8x16_t v0lf = vorrq_s8(v0lz, qhl); - const int8x16_t v0hf = vorrq_s8(v0hz, qhh); + const int8x16_t v0lf = vorrq_s8(v0l, qhl); + const int8x16_t v0hf = vorrq_s8(v0h, qhh); // load y const int8x16_t v1l = vld1q_s8(y0->qs); @@ -3044,36 +3012,28 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * *s = hsum_float_8(acc) 
+ summs; #else + // scalar float sumf = 0.0; for (int i = 0; i < nb; i++) { - const uint8_t * restrict x0 = x[i].qs; - const int8_t * restrict y0 = y[i].qs; + const int8_t * py = y[i].qs; uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); - const float d = GGML_FP16_TO_FP32(x[i].d); - const float m = GGML_FP16_TO_FP32(x[i].m); - - int sxy = 0; - - for (int j = 0; j < QK8_1/2; j++) { - const uint8_t v0 = x0[j]; - - const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4; - const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4; + int sumi = 0; - const int x0_0 = (v0 & 0x0F) | x0_0h; - const int x1_0 = (v0 >> 4) | x1_0h; + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - const int y0_0 = y0[2*j + 0]; - const int y1_0 = y0[2*j + 1]; + const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; - sxy += x0_0*y0_0 + x1_0*y1_0; + sumi += (x0 * py[j]) + (x1 * py[j + qk/2]); } - sumf += (d*sxy)*y[i].d + m*(y[i].s0 + y[i].s1); + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*(y[i].s0 + y[i].s1); } *s = sumf; From caaacd576552db0a76cd89ca18b0940e80302ead Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 5 May 2023 17:12:58 +0300 Subject: [PATCH 08/32] ggml : simplify scalar dot --- ggml.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/ggml.c b/ggml.c index cb321455b4dd2..1e23d3ec83977 100644 --- a/ggml.c +++ b/ggml.c @@ -2363,15 +2363,13 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * float sumf = 0.0; for (int i = 0; i < nb; i++) { - const int8_t * py = y[i].qs; - int sumi = 0; for (int j = 0; j < qk/2; ++j) { const int v0 = (x[i].qs[j] & 0xf) - 8; const int v1 = (x[i].qs[j] >> 4) - 8; - sumi += (v0 * py[j]) + (v1 * py[j + qk/2]); + sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } sumf += (x[i].d*y[i].d)*sumi; @@ -2487,15 +2485,13 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * float sumf = 0.0; for (int i = 0; i < nb; i++) { - const int8_t * py = y[i].qs; - int sumi = 0; for (int j = 0; j < qk/2; ++j) { const int v0 = (x[i].qs[j] & 0xf); const int v1 = (x[i].qs[j] >> 4); - sumi += (v0 * py[j]) + (v1 * py[j + qk/2]); + sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } sumf += (x[i].d*y[i].d)*sumi + x[i].m*(y[i].s0 + y[i].s1); @@ -2821,8 +2817,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * float sumf = 0.0; for (int i = 0; i < nb; i++) { - const int8_t * py = y[i].qs; - uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); @@ -2835,7 +2829,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - sumi += (x0 * py[j]) + (x1 * py[j + qk/2]); + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); } sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi; @@ -3016,8 +3010,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * float sumf = 0.0; for (int i = 0; i < nb; i++) { - const int8_t * py = y[i].qs; - uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); @@ -3030,7 +3022,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; - sumi += (x0 * py[j]) + (x1 * py[j + qk/2]); + sumi += (x0 * 
y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); } sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*(y[i].s0 + y[i].s1); From 0add6402bdbc825c27a101730756a7cb037da5ff Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 5 May 2023 17:23:41 +0300 Subject: [PATCH 09/32] ggml : remove WASM SIMD bit shuffling + remove vzip for ARM 32-bit --- ggml.c | 120 +++++---------------------------------------------------- 1 file changed, 9 insertions(+), 111 deletions(-) diff --git a/ggml.c b/ggml.c index 1e23d3ec83977..ed6e4742bc8b8 100644 --- a/ggml.c +++ b/ggml.c @@ -689,94 +689,6 @@ float vmaxvq_f32(float32x4_t v) { MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); } -int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) { - int8x8_t res; - - res[0] = a[0]; res[1] = b[0]; - res[2] = a[1]; res[3] = b[1]; - res[4] = a[2]; res[5] = b[2]; - res[6] = a[3]; res[7] = b[3]; - - return res; -} - -int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) { - int8x8_t res; - - res[0] = a[4]; res[1] = b[4]; - res[2] = a[5]; res[3] = b[5]; - res[4] = a[6]; res[5] = b[6]; - res[6] = a[7]; res[7] = b[7]; - - return res; -} - -uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { - uint8x8_t res; - - res[0] = a[0]; res[1] = b[0]; - res[2] = a[1]; res[3] = b[1]; - res[4] = a[2]; res[5] = b[2]; - res[6] = a[3]; res[7] = b[3]; - - return res; -} - -uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { - uint8x8_t res; - - res[0] = a[4]; res[1] = b[4]; - res[2] = a[5]; res[3] = b[5]; - res[4] = a[6]; res[5] = b[6]; - res[6] = a[7]; res[7] = b[7]; - - return res; -} - -int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) { - int8x16_t res; - - res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1]; - res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3]; - res[8] = a[4]; res[9] = b[4]; res[10] = a[5]; res[11] = b[5]; - res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7]; - - return res; -} - -int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) { - int8x16_t res; - - res[0] = a[8]; res[1] = b[8]; res[2] = a[9]; res[3] = b[9]; - res[4] = a[10]; res[5] = b[10]; res[6] = a[11]; res[7] = b[11]; - res[8] = a[12]; res[9] = b[12]; res[10] = a[13]; res[11] = b[13]; - res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15]; - - return res; -} - -uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) { - uint8x16_t res; - - res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1]; - res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3]; - res[8] = a[4]; res[9] = b[4]; res[10] = a[5]; res[11] = b[5]; - res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7]; - - return res; -} - -uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) { - uint8x16_t res; - - res[0] = a[8]; res[1] = b[8]; res[2] = a[9]; res[3] = b[9]; - res[4] = a[10]; res[5] = b[10]; res[6] = a[11]; res[7] = b[11]; - res[8] = a[12]; res[9] = b[12]; res[10] = a[13]; res[11] = b[13]; - res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15]; - - return res; -} - int32x4_t vcvtnq_s32_f32(float32x4_t v) { int32x4_t res; @@ -2753,13 +2665,9 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const v128_t v0l = wasm_v128_and (v0, m4b); const v128_t v0h = wasm_u8x16_shr(v0, 4); - // interleave - const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); - const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); - // add high bit and sub 16 - const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b); - const v128_t v0hf = 
wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b); + const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0l, qhl), s16b); + const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0h, qhh), s16b); // load y const v128_t v1l = wasm_v128_load(y0->qs); @@ -2944,13 +2852,9 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * static bool x = true; - // interleave - const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); - const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); - // add high bit - const v128_t v0lf = wasm_v128_or(v0lz, qhl); - const v128_t v0hf = wasm_v128_or(v0hz, qhh); + const v128_t v0lf = wasm_v128_or(v0l, qhl); + const v128_t v0hf = wasm_v128_or(v0h, qhh); // load y const v128_t v1l = wasm_v128_load(y0->qs); @@ -3033,11 +2937,11 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * } static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK8_0; + const int qk = QK8_0; + const int nb = n / qk; - assert(n % QK8_0 == 0); + assert(n % qk == 0); assert(nb % 2 == 0); - assert(QK8_0 == QK8_0); const block_q8_0 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -3117,16 +3021,10 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * float sumf = 0.0; for (int i = 0; i < nb; i++) { - const int8_t * restrict x0 = x[i].qs; - const int8_t * restrict y0 = y[i].qs; - int sumi = 0; - for (int j = 0; j < QK8_0; j++) { - const int v0 = x0[j]; - const int v1 = y0[j]; - - sumi += v0*v1; + for (int j = 0; j < qk; j++) { + sumi += x[i].qs[j]*y[i].qs[j]; } sumf += (x[i].d*y[i].d)*sumi; From 9472d0ea8bcee2a78178bd0070c20f8824c7378d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 May 2023 18:07:11 +0300 Subject: [PATCH 10/32] ggml : fix Q4_1 quantization --- ggml.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index ed6e4742bc8b8..5d00404890822 100644 --- a/ggml.c +++ b/ggml.c @@ -826,8 +826,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r y[i].m = min; for (int l = 0; l < qk/2; ++l) { - const float x0 = (x[0 + l] - min)*id; - const float x1 = (x[qk/2 + l] - min)*id; + const float x0 = (x[i*qk + 0 + l] - min)*id; + const float x1 = (x[i*qk + qk/2 + l] - min)*id; const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); From cdc960732900163fab4ce496fca5c6d2c687fe5c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 May 2023 18:23:59 +0300 Subject: [PATCH 11/32] ggml : update cuBLAS + normalize variable names --- ggml-cuda.cu | 84 +++++++------------ ggml.c | 233 ++++++++++++++++++++++++++------------------------- 2 files changed, 149 insertions(+), 168 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 127b352a0f2c9..b1a9ffb3258a9 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -81,29 +81,26 @@ typedef struct { static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding"); static __global__ void dequantize_block_q4_0(const void * vx, float * y) { + static const int qk = QK4_0; + const block_q4_0 * x = (const block_q4_0 *) vx; const int i = blockIdx.x; const float d = x[i].d; - const uint8_t * pp = x[i].qs; - - for (int l = 0; l < QK4_0; l += 2) { - const uint8_t vi = pp[l/2]; - - const int8_t vi0 = vi & 0xf; - const int8_t vi1 = vi >> 4; + for (int j 
= 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0xf) - 8; + const int x1 = (x[i].qs[j] >> 4) - 8; - const float v0 = (vi0 - 8)*d; - const float v1 = (vi1 - 8)*d; - - y[i*QK4_0 + l + 0] = v0; - y[i*QK4_0 + l + 1] = v1; + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; } } static __global__ void dequantize_block_q4_1(const void * vx, float * y) { + static const int qk = QK4_1; + const block_q4_1 * x = (const block_q4_1 *) vx; const int i = blockIdx.x; @@ -111,19 +108,12 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) { const float d = x[i].d; const float m = x[i].m; - const uint8_t * pp = x[i].qs; - - for (int l = 0; l < QK4_1; l += 2) { - const uint8_t vi = pp[l/2]; - - const int8_t vi0 = vi & 0xf; - const int8_t vi1 = vi >> 4; + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0xf); + const int x1 = (x[i].qs[j] >> 4); - const float v0 = vi0*d + m; - const float v1 = vi1*d + m; - - y[i*QK4_1 + l + 0] = v0; - y[i*QK4_1 + l + 1] = v1; + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; } } @@ -151,35 +141,32 @@ static __global__ void dequantize_block_q4_2(const void * vx, float * y) { } static __global__ void dequantize_block_q5_0(const void * vx, float * y) { + static const int qk = QK5_0; + const block_q5_0 * x = (const block_q5_0 *) vx; const int i = blockIdx.x; const float d = x[i].d; - const uint8_t * pp = x[i].qs; - uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); - for (int l = 0; l < QK5_0; l += 2) { - const uint8_t vi = pp[l/2]; - - const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4; - const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4; + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - const int8_t vi0 = ((vi & 0xf) | vh0); - const int8_t vi1 = ((vi >> 4) | vh1); + const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - const float v0 = (vi0 - 16)*d; - const float v1 = (vi1 - 16)*d; - - y[i*QK5_0 + l + 0] = v0; - y[i*QK5_0 + l + 1] = v1; + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; } } static __global__ void dequantize_block_q5_1(const void * vx, float * y) { + static const int qk = QK5_1; + const block_q5_1 * x = (const block_q5_1 *) vx; const int i = blockIdx.x; @@ -187,25 +174,18 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) { const float d = x[i].d; const float m = x[i].m; - const uint8_t * pp = x[i].qs; - uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); - for (int l = 0; l < QK5_1; l += 2) { - const uint8_t vi = pp[l/2]; - - const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4; - const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4; - - const int8_t vi0 = (vi & 0xf) | vh0; - const int8_t vi1 = (vi >> 4) | vh1; + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - const float v0 = vi0*d + m; - const float v1 = vi1*d + m; + const int x0 = (x[i].qs[j] & 0xf) | xh_0; + const int x1 = (x[i].qs[j] >> 4) | xh_1; - y[i*QK5_1 + l + 0] = v0; - y[i*QK5_1 + l + 1] = v1; + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; } } diff --git a/ggml.c b/ggml.c index 5d00404890822..817782179a9d3 100644 --- a/ggml.c +++ b/ggml.c @@ -771,8 +771,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r float amax = 0.0f; // absolute max float max = 0.0f; - 
for (int l = 0; l < qk; l++) { - const float v = x[i*qk + l]; + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; if (amax < fabsf(v)) { amax = fabsf(v); max = v; @@ -784,15 +784,15 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r y[i].d = d; - for (int l = 0; l < qk/2; ++l) { - const float x0 = x[i*qk + 0 + l]*id; - const float x1 = x[i*qk + qk/2 + l]*id; + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); - y[i].qs[l] = xi0; - y[i].qs[l] |= xi1 << 4; + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; } } } @@ -812,8 +812,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r float min = FLT_MAX; float max = -FLT_MAX; - for (int l = 0; l < qk; l++) { - const float v = x[i*qk + l]; + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; if (v < min) min = v; if (v > max) max = v; @@ -825,15 +825,15 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r y[i].d = d; y[i].m = min; - for (int l = 0; l < qk/2; ++l) { - const float x0 = (x[i*qk + 0 + l] - min)*id; - const float x1 = (x[i*qk + qk/2 + l] - min)*id; + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); - y[i].qs[l] = xi0; - y[i].qs[l] |= xi1 << 4; + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; } } } @@ -854,8 +854,8 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r float amax = 0.0f; // absolute max float max = 0.0f; - for (int l = 0; l < qk; l++) { - const float v = x[i*qk + l]; + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; if (amax < fabsf(v)) { amax = fabsf(v); max = v; @@ -869,15 +869,15 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r uint64_t qs[QK4_2 / 16] = {0}; - for (int l = 0; l < qk/2; ++l) { - const float x0 = x[i*qk + 0 + l]*id; - const float x1 = x[i*qk + qk/2 + l]*id; + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; const uint64_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); const uint64_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); - qs[l/8] |= xi0 << (8*(l & 7)); - qs[l/8] |= xi1 << (8*(l & 7) + 4); + qs[j/8] |= xi0 << (8*(j & 7)); + qs[j/8] |= xi1 << (8*(j & 7) + 4); } memcpy(y[i].qs, qs, qk/2); @@ -899,8 +899,8 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r float amax = 0.0f; // absolute max float max = 0.0f; - for (int l = 0; l < qk; l++) { - const float v = x[i*qk + l]; + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; if (amax < fabsf(v)) { amax = fabsf(v); max = v; @@ -914,18 +914,18 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r uint32_t qh = 0; - for (int l = 0; l < qk/2; ++l) { - const float x0 = x[i*qk + 0 + l]*id; - const float x1 = x[i*qk + qk/2 + l]*id; + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); - y[i].qs[l] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); // get the 5-th bit and store it in qh at the right position - qh |= 
((xi0 & 0x10) >> 4) << (l + 0); - qh |= ((xi1 & 0x10) >> 4) << (l + qk/2); + qh |= ((xi0 & 0x10) >> 4) << (j + 0); + qh |= ((xi1 & 0x10) >> 4) << (j + qk/2); } memcpy(&y[i].qh, &qh, sizeof(qh)); @@ -947,8 +947,8 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r float min = FLT_MAX; float max = -FLT_MAX; - for (int l = 0; l < qk; l++) { - const float v = x[i*qk + l]; + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; if (v < min) min = v; if (v > max) max = v; @@ -962,18 +962,18 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r uint32_t qh = 0; - for (int l = 0; l < qk/2; ++l) { - const float x0 = (x[i*qk + 0 + l] - min)*id; - const float x1 = (x[i*qk + qk/2 + l] - min)*id; + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; const uint8_t xi0 = (uint8_t)(x0 + 0.5f); const uint8_t xi1 = (uint8_t)(x1 + 0.5f); - y[i].qs[l] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); // get the 5-th bit and store it in qh at the right position - qh |= ((xi0 & 0x10) >> 4) << (l + 0); - qh |= ((xi1 & 0x10) >> 4) << (l + qk/2); + qh |= ((xi0 & 0x10) >> 4) << (j + 0); + qh |= ((xi1 & 0x10) >> 4) << (j + qk/2); } memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); @@ -992,8 +992,8 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max - for (int l = 0; l < QK8_0; l++) { - const float v = x[i*QK8_0 + l]; + for (int j = 0; j < QK8_0; j++) { + const float v = x[i*QK8_0 + j]; amax = MAX(amax, fabsf(v)); } @@ -1002,10 +1002,10 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r y[i].d = d; - for (int l = 0; l < QK8_0; ++l) { - const float v0 = x[i*QK8_0 + l]*id; + for (int j = 0; j < QK8_0; ++j) { + const float v0 = x[i*QK8_0 + j]*id; - y[i].qs[l] = roundf(v0); + y[i].qs[j] = roundf(v0); } } } @@ -1146,8 +1146,8 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max - for (int l = 0; l < QK8_1; l++) { - const float v = x[i*QK8_1 + l]; + for (int j = 0; j < QK8_1; j++) { + const float v = x[i*QK8_1 + j]; amax = MAX(amax, fabsf(v)); } @@ -1159,15 +1159,15 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r int sum0 = 0; int sum1 = 0; - for (int l = 0; l < QK8_1/2; ++l) { - const float v0 = x[i*QK8_1 + l]*id; - const float v1 = x[i*QK8_1 + QK8_1/2 + l]*id; + for (int j = 0; j < QK8_1/2; ++j) { + const float v0 = x[i*QK8_1 + j]*id; + const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id; - y[i].qs[ l] = roundf(v0); - y[i].qs[QK8_1/2 + l] = roundf(v1); + y[i].qs[ j] = roundf(v0); + y[i].qs[QK8_1/2 + j] = roundf(v1); - sum0 += y[i].qs[ l]; - sum1 += y[i].qs[QK8_1/2 + l]; + sum0 += y[i].qs[ j]; + sum1 += y[i].qs[QK8_1/2 + j]; } y[i].s0 = d * sum0; @@ -1187,12 +1187,12 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int float32x4_t asrcv[8]; float32x4_t amaxv[8]; - for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l); - for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]); + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); - for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]); - for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], 
amaxv[4*l+2]); - for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]); + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); const float amax = vmaxvq_f32(amaxv[0]); @@ -1205,27 +1205,27 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int int32x4_t accv1 = vdupq_n_s32(0); // low half - for (int l = 0; l < 4; l++) { - const float32x4_t v = vmulq_n_f32(srcv[l], id); + for (int j = 0; j < 4; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); const int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3); + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); accv0 = vaddq_s32(accv0, vi); } // high half - for (int l = 4; l < 8; l++) { - const float32x4_t v = vmulq_n_f32(srcv[l], id); + for (int j = 4; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); const int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3); + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); accv1 = vaddq_s32(accv1, vi); } @@ -1393,14 +1393,14 @@ static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict const uint8_t * qsp = b4_from_nibbles_64(qk, x[i].qs, qs); - for (int l = 0; l < qk; ++l) { - y[i*qk + l] = (qsp[l] - 8)*d; + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = (qsp[j] - 8)*d; } } } static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { - static const int qk = QK4_0; + static const int qk = QK5_0; assert(k % qk == 0); @@ -1453,18 +1453,19 @@ static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict } static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, int k) { - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; + static const int qk = QK8_0; + + assert(k % qk == 0); + + const int nb = k / qk; const block_q8_0 * restrict x = vx; for (int i = 0; i < nb; i++) { const float d = x[i].d; - const int8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK8_0; ++l) { - y[i*QK8_0 + l] = pp[l]*d; + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = x[i].qs[j]*d; } } } @@ -12314,15 +12315,15 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * assert(k % QK4_0 == 0); const int nb = k / QK4_0; - for (int j = 0; j < n; j += k) { - block_q4_0 * restrict y = (block_q4_0 *) dst + j/QK4_0; + for (int b = 0; b < n; b += k) { + block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0; - quantize_row_q4_0_reference(src + j, y, k); + quantize_row_q4_0_reference(src + b, y, k); for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK4_0; l += 2) { - const uint8_t vi0 = y[i].qs[l/2] & 0x0F; - const uint8_t vi1 = y[i].qs[l/2] >> 4; + for (int j = 0; j < QK4_0; j += 2) { + const uint8_t vi0 = y[i].qs[j/2] & 0x0F; + const uint8_t vi1 = y[i].qs[j/2] >> 4; hist[vi0]++; hist[vi1]++; @@ -12337,15 +12338,15 @@ 
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * assert(k % QK4_1 == 0); const int nb = k / QK4_1; - for (int j = 0; j < n; j += k) { - block_q4_1 * restrict y = (block_q4_1 *) dst + j/QK4_1; + for (int b = 0; b < n; b += k) { + block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1; - quantize_row_q4_1_reference(src + j, y, k); + quantize_row_q4_1_reference(src + b, y, k); for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK4_1; l += 2) { - const uint8_t vi0 = y[i].qs[l/2] & 0x0F; - const uint8_t vi1 = y[i].qs[l/2] >> 4; + for (int j = 0; j < QK4_1; j += 2) { + const uint8_t vi0 = y[i].qs[j/2] & 0x0F; + const uint8_t vi1 = y[i].qs[j/2] >> 4; hist[vi0]++; hist[vi1]++; @@ -12360,15 +12361,15 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * assert(k % QK4_2 == 0); const int nb = k / QK4_2; - for (int j = 0; j < n; j += k) { - block_q4_2 * restrict y = (block_q4_2 *)dst + j/QK4_2; + for (int b = 0; b < n; b += k) { + block_q4_2 * restrict y = (block_q4_2 *)dst + b/QK4_2; - quantize_row_q4_2_reference(src + j, y, k); + quantize_row_q4_2_reference(src + b, y, k); for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK4_2; l += 2) { - const uint8_t vi0 = y[i].qs[l/2] & 0x0F; - const uint8_t vi1 = y[i].qs[l/2] >> 4; + for (int j = 0; j < QK4_2; j += 2) { + const uint8_t vi0 = y[i].qs[j/2] & 0x0F; + const uint8_t vi1 = y[i].qs[j/2] >> 4; hist[vi0]++; hist[vi1]++; @@ -12383,22 +12384,22 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * assert(k % QK5_0 == 0); const int nb = k / QK5_0; - for (int j = 0; j < n; j += k) { - block_q5_0 * restrict y = (block_q5_0 *)dst + j/QK5_0; + for (int b = 0; b < n; b += k) { + block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0; - quantize_row_q5_0_reference(src + j, y, k); + quantize_row_q5_0_reference(src + b, y, k); for (int i = 0; i < nb; i++) { uint32_t qh; memcpy(&qh, &y[i].qh, sizeof(qh)); - for (int l = 0; l < QK5_0; l += 2) { - const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; + for (int j = 0; j < QK5_0; j += 2) { + const uint8_t vh0 = ((qh & (1u << (j + 0))) >> (j + 0)) << 4; + const uint8_t vh1 = ((qh & (1u << (j + 1))) >> (j + 1)) << 4; // cast to 16 bins - const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2; - const uint8_t vi1 = ((y[i].qs[l/2] >> 4) | vh1) / 2; + const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; + const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2; hist[vi0]++; hist[vi1]++; @@ -12413,22 +12414,22 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * assert(k % QK5_1 == 0); const int nb = k / QK5_1; - for (int j = 0; j < n; j += k) { - block_q5_1 * restrict y = (block_q5_1 *)dst + j/QK5_1; + for (int b = 0; b < n; b += k) { + block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1; - quantize_row_q5_1_reference(src + j, y, k); + quantize_row_q5_1_reference(src + b, y, k); for (int i = 0; i < nb; i++) { uint32_t qh; memcpy(&qh, &y[i].qh, sizeof(qh)); - for (int l = 0; l < QK5_1; l += 2) { - const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; + for (int j = 0; j < QK5_1; j += 2) { + const uint8_t vh0 = ((qh & (1u << (j + 0))) >> (j + 0)) << 4; + const uint8_t vh1 = ((qh & (1u << (j + 1))) >> (j + 1)) << 4; // cast to 16 bins - const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2; - const uint8_t vi1 = ((y[i].qs[l/2] >> 4) | vh1) / 2; + const 
uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; + const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2; hist[vi0]++; hist[vi1]++; @@ -12443,14 +12444,14 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * assert(k % QK8_0 == 0); const int nb = k / QK8_0; - for (int j = 0; j < n; j += k) { - block_q8_0 * restrict y = (block_q8_0 *)dst + j/QK8_0; + for (int b = 0; b < n; b += k) { + block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0; - quantize_row_q8_0_reference(src + j, y, k); + quantize_row_q8_0_reference(src + b, y, k); for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK8_0; ++l) { - const int8_t vi = y[i].qs[l]; + for (int j = 0; j < QK8_0; ++j) { + const int8_t vi = y[i].qs[j]; hist[vi/16 + 8]++; } From 4bf1c8a43e26ac706e1ca8cf78fa12e7203cda89 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 May 2023 18:26:59 +0300 Subject: [PATCH 12/32] ggml : remove Q4_2 mode --- examples/quantize/quantize.cpp | 11 +- ggml-cuda.cu | 37 ----- ggml-opencl.c | 30 +--- ggml.c | 286 --------------------------------- ggml.h | 4 +- llama.cpp | 4 - llama.h | 2 +- 7 files changed, 8 insertions(+), 366 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 7c77018daa344..115d8fb1ba36b 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -7,12 +7,11 @@ #include static const std::map LLAMA_FTYPE_MAP = { - {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0}, - {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1}, - {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2}, - {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0}, - {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1}, - {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0}, + {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0}, + {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1}, + {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0}, + {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1}, + {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0}, }; bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) { diff --git a/ggml-cuda.cu b/ggml-cuda.cu index b1a9ffb3258a9..46f7b568c608b 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -49,13 +49,6 @@ typedef struct { } block_q4_1; static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); -#define QK4_2 16 -typedef struct { - half d; // delta - uint8_t qs[QK4_2 / 2]; // nibbles / quants -} block_q4_2; -static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); - #define QK5_0 32 typedef struct { half d; // delta @@ -117,29 +110,6 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) { } } -static __global__ void dequantize_block_q4_2(const void * vx, float * y) { - const block_q4_2 * x = (const block_q4_2 *) vx; - - const int i = blockIdx.x; - - const float d = x[i].d; - - const uint8_t * pp = x[i].qs; - - for (int l = 0; l < QK4_2; l += 2) { - const uint8_t vi = pp[l/2]; - - const int8_t vi0 = vi & 0xf; - const int8_t vi1 = vi >> 4; - - const float v0 = (vi0 - 8)*d; - const float v1 = (vi1 - 8)*d; - - y[i*QK4_2 + l + 0] = v0; - y[i*QK4_2 + l + 1] = v1; - } -} - static __global__ void dequantize_block_q5_0(const void * vx, float * y) { static const int qk = QK5_0; @@ -215,11 +185,6 @@ static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStre dequantize_block_q4_1<<>>(vx, y); } -static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) { - const int nb = k / QK4_2; - dequantize_block_q4_2<<>>(vx, y); -} - static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, 
cudaStream_t stream) { const int nb = k / QK5_0; dequantize_block_q5_0<<>>(vx, y); @@ -254,8 +219,6 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_q4_0_cuda; case GGML_TYPE_Q4_1: return dequantize_row_q4_1_cuda; - case GGML_TYPE_Q4_2: - return dequantize_row_q4_2_cuda; case GGML_TYPE_Q5_0: return dequantize_row_q5_0_cuda; case GGML_TYPE_Q5_1: diff --git a/ggml-opencl.c b/ggml-opencl.c index 4389eca393466..0e6e6770f6307 100644 --- a/ggml-opencl.c +++ b/ggml-opencl.c @@ -52,26 +52,6 @@ __kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global f result[index + 1] = (vi >> 4) * d + m; } -struct block_q4_2 -{ - ushort d; - uchar qs[8]; -}; - -__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) { - const uint i = get_global_id(0) / 16; - const uint l = get_local_id(0); - - const float d = vload_half(0, (__global half*) &blocks[i].d); - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*16 + l*2; - result[index + 0] = ((vi & 0xf) - 8)*d; - result[index + 1] = ((vi >> 4) - 8)*d; -} - - struct block_q5_0 { float d; @@ -167,7 +147,7 @@ static cl_device_id device; static cl_context context; static cl_command_queue queue; static cl_program program; -static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0; +static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0; static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c; static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0; @@ -238,8 +218,6 @@ void ggml_cl_init(void) { CL_CHECK(err, "clCreateKernel"); kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err); CL_CHECK(err, "clCreateKernel"); - kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err); - CL_CHECK(err, "clCreateKernel"); kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err); CL_CHECK(err, "clCreateKernel"); kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err); @@ -292,12 +270,6 @@ void ggml_cl_sgemm_wrapper( local = 16; size_qb = global * (sizeof(float) * 2 + local) / 32; break; - case GGML_TYPE_Q4_2: - dequant = true; - kernel = kernel_q4_2; - local = 8; - size_qb = global * (sizeof(ggml_fp16_t) + local) / 16; - break; case GGML_TYPE_Q5_0: dequant = true; kernel = kernel_q5_0; diff --git a/ggml.c b/ggml.c index 817782179a9d3..ea73bf8441e6f 100644 --- a/ggml.c +++ b/ggml.c @@ -615,18 +615,6 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) #if __ARM_NEON -// TODO: obsolete - will be removed -static inline const uint8_t * b4_from_nibbles_64(const int qk, const uint8_t * qs, uint64_t * qd) { - memcpy(qd, qs, qk/2); - - for (int l = 0; l < qk/16; ++l) { - qd[l + qk/16] = (qd[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; - qd[l + 0 ] = (qd[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; - } - - return (const uint8_t *) qd; -} - #if !defined(__aarch64__) inline static uint16_t vaddvq_u8(uint8x16_t v) { @@ -719,13 +707,6 @@ typedef struct { } block_q4_1; static_assert(sizeof(block_q4_1) == 2 * sizeof(float) + QK4_1 / 2, "wrong q4_1 block size/padding"); -#define QK4_2 16 -typedef struct { - ggml_fp16_t d; // delta - uint8_t qs[QK4_2 / 2]; // nibbles / quants -} block_q4_2; -static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); - #define QK5_0 32 typedef struct { ggml_fp16_t d; // delta @@ -842,52 +823,6 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k
quantize_row_q4_1_reference(x, y, k); } -// reference implementation for deterministic creation of model files -static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * restrict y, int k) { - static const int qk = QK4_2; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < qk; j++) { - const float v = x[i*qk + j]; - if (amax < fabsf(v)) { - amax = fabsf(v); - max = v; - } - } - - const float d = max / -8; - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - uint64_t qs[QK4_2 / 16] = {0}; - - for (int j = 0; j < qk/2; ++j) { - const float x0 = x[i*qk + 0 + j]*id; - const float x1 = x[i*qk + qk/2 + j]*id; - - const uint64_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); - const uint64_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); - - qs[j/8] |= xi0 << (8*(j & 7)); - qs[j/8] |= xi1 << (8*(j & 7) + 4); - } - - memcpy(y[i].qs, qs, qk/2); - } -} - -static void quantize_row_q4_2(const float * restrict x, void * restrict y, int k) { - quantize_row_q4_2_reference(x, y, k); -} - static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { static const int qk = QK5_0; @@ -1378,27 +1313,6 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict } } -static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict y, int k) { - // BROKEN !!! - static const int qk = QK4_2; - - assert(k % qk == 0); - - const int nb = k / qk; - - uint64_t qs[QK4_2 / 8]; - - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * qsp = b4_from_nibbles_64(qk, x[i].qs, qs); - - for (int j = 0; j < qk; ++j) { - y[i*qk + j] = (qsp[j] - 8)*d; - } - } -} - static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { static const int qk = QK5_0; @@ -1472,7 +1386,6 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); @@ -1494,14 +1407,6 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_q = ggml_vec_dot_q4_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, }, - [GGML_TYPE_Q4_2] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_2, - .quantize_row_q = quantize_row_q4_2, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q4_2_q8_0, - .vec_dot_type = GGML_TYPE_Q8_0, - }, [GGML_TYPE_Q5_0] = { .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_0, .quantize_row_q = quantize_row_q5_0, @@ -2414,159 +2319,6 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * #endif } -static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
restrict vx, const void * restrict vy) { - const int qk = QK8_0; - const int nb = n / qk; - - assert(n % qk == 0); - assert(nb % 2 == 0); - - assert(qk == 2*QK4_2); - - const block_q4_2 * restrict x = vx; - const block_q8_0 * restrict y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - for (int i = 0; i < nb; i += 2) { - const block_q4_2 * restrict x0_0 = &x[2*(i + 0) + 0]; - const block_q4_2 * restrict x0_1 = &x[2*(i + 0) + 1]; - const block_q4_2 * restrict x1_0 = &x[2*(i + 1) + 0]; - const block_q4_2 * restrict x1_1 = &x[2*(i + 1) + 1]; - - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - const int8x16_t s8b = vdupq_n_s8(0x8); - - const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs)); - const uint8x16_t v0_1 = vcombine_u8(vld1_u8(x1_0->qs), vld1_u8(x1_1->qs)); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // sub 8 - const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); - const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); - const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); - const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - sumv0 = vmlaq_n_f32(sumv0, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l)), GGML_FP16_TO_FP32(x0_0->d)), - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hs, v1_0h)), GGML_FP16_TO_FP32(x0_1->d))), y0->d); - - sumv1 = vmlaq_n_f32(sumv1, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l)), GGML_FP16_TO_FP32(x1_0->d)), - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1hs, v1_1h)), GGML_FP16_TO_FP32(x1_1->d))), y1->d); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(pl0), GGML_FP16_TO_FP32(x0_0->d)), - vmulq_n_f32(vcvtq_f32_s32(ph0), GGML_FP16_TO_FP32(x0_1->d))), y0->d); - - sumv1 = vmlaq_n_f32(sumv1, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(pl1), GGML_FP16_TO_FP32(x1_0->d)), - vmulq_n_f32(vcvtq_f32_s32(ph1), GGML_FP16_TO_FP32(x1_1->d))), y1->d); -#endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif 
defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; i++) { - /* Compute combined scale for the block */ - const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d)); - const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d)); - const __m256 d = _mm256_mul_ps(_mm256_set_m128(d1, d0), _mm256_broadcast_ss(&y[i].d)); - - __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs); - __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs); - __m256i bx = _mm256_set_m128i(bx1, bx0); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. - const __m256i off = _mm256_set1_epi8(8); - bx = _mm256_sub_epi8(bx, off); - - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps(d, q, acc); - } - - *s = hsum_float_8(acc); -#else - // scalar - float sumf = 0.0; - for (int i = 0; i < nb; i++) { - const uint8_t * restrict x0 = x[2*i + 0].qs; - const uint8_t * restrict x1 = x[2*i + 1].qs; - const int8_t * restrict y0 = y[i].qs; - - const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d); - const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d); - - int sumi_0 = 0; - int sumi_1 = 0; - - for (int j = 0; j < QK8_0/4; j++) { - const uint8_t v0 = x0[j]; - const uint8_t v1 = x1[j]; - - const int i0_0 = (int8_t) (v0 & 0x0F) - 8; - const int i1_0 = (int8_t) (v0 >> 4) - 8; - - const int i0_1 = (int8_t) (v1 & 0x0F) - 8; - const int i1_1 = (int8_t) (v1 >> 4) - 8; - - const int i2_0 = y0[2*j + 0]; - const int i3_0 = y0[2*j + 1]; - - const int i2_1 = y0[2*(j + QK8_0/4) + 0]; - const int i3_1 = y0[2*(j + QK8_0/4) + 1]; - - sumi_0 += i0_0*i2_0 + i1_0*i3_0; - sumi_1 += i0_1*i2_1 + i1_1*i3_1; - } - - sumf += (d0 * y[i].d) * sumi_0; - sumf += (d1 * y[i].d) * sumi_1; - } - *s = sumf; -#endif -} - static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int qk = QK8_0; const int nb = n / qk; @@ -3289,7 +3041,6 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_F16] = 1, [GGML_TYPE_Q4_0] = QK4_0, [GGML_TYPE_Q4_1] = QK4_1, - [GGML_TYPE_Q4_2] = QK4_2, [GGML_TYPE_Q5_0] = QK5_0, [GGML_TYPE_Q5_1] = QK5_1, [GGML_TYPE_Q8_0] = QK8_0, @@ -3305,7 +3056,6 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_F16] = sizeof(ggml_fp16_t), [GGML_TYPE_Q4_0] = sizeof(block_q4_0), [GGML_TYPE_Q4_1] = sizeof(block_q4_1), - [GGML_TYPE_Q4_2] = sizeof(block_q4_2), [GGML_TYPE_Q5_0] = sizeof(block_q5_0), [GGML_TYPE_Q5_1] = sizeof(block_q5_1), [GGML_TYPE_Q8_0] = sizeof(block_q8_0), @@ -3322,7 +3072,6 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { [GGML_TYPE_F16] = "f16", [GGML_TYPE_Q4_0] = "q4_0", [GGML_TYPE_Q4_1] = "q4_1", - [GGML_TYPE_Q4_2] = "q4_2", [GGML_TYPE_Q5_0] = "q5_0", [GGML_TYPE_Q5_1] = "q5_1", [GGML_TYPE_Q8_0] = "q8_0", @@ -3338,7 +3087,6 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { [GGML_TYPE_F16] = false, [GGML_TYPE_Q4_0] = true, [GGML_TYPE_Q4_1] = true, - [GGML_TYPE_Q4_2] = true, [GGML_TYPE_Q5_0] = true, [GGML_TYPE_Q5_1] = true, [GGML_TYPE_Q8_0] = true, @@ -3623,7 +3371,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break; case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break; case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break; - case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break; case 
GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break; case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break; case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break; @@ -6624,7 +6371,6 @@ static void ggml_compute_forward_add( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -8179,7 +7925,6 @@ static void ggml_compute_forward_mul_mat( switch (src0->type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -8410,7 +8155,6 @@ static void ggml_compute_forward_get_rows( switch (src0->type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -8735,7 +8479,6 @@ static void ggml_compute_forward_alibi( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -12357,29 +12100,6 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * return (n/QK4_1*sizeof(block_q4_1)); } -size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK4_2 == 0); - const int nb = k / QK4_2; - - for (int b = 0; b < n; b += k) { - block_q4_2 * restrict y = (block_q4_2 *)dst + b/QK4_2; - - quantize_row_q4_2_reference(src + b, y, k); - - for (int i = 0; i < nb; i++) { - for (int j = 0; j < QK4_2; j += 2) { - const uint8_t vi0 = y[i].qs[j/2] & 0x0F; - const uint8_t vi1 = y[i].qs[j/2] >> 4; - - hist[vi0]++; - hist[vi1]++; - } - } - } - - return (n/QK4_2*sizeof(block_q4_2)); -} - size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK5_0 == 0); const int nb = k / QK5_0; @@ -12476,12 +12196,6 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i block_q4_1 * block = (block_q4_1*)dst + start / QK4_1; result = ggml_quantize_q4_1(src + start, block, n, n, hist); } break; - case GGML_TYPE_Q4_2: - { - GGML_ASSERT(start % QK4_2 == 0); - block_q4_2 * block = (block_q4_2*)dst + start / QK4_2; - result = ggml_quantize_q4_2(src + start, block, n, n, hist); - } break; case GGML_TYPE_Q5_0: { GGML_ASSERT(start % QK5_0 == 0); diff --git a/ggml.h b/ggml.h index 508dd69b41713..bb9a025e257d5 100644 --- a/ggml.h +++ b/ggml.h @@ -231,7 +231,7 @@ extern "C" { GGML_TYPE_F16 = 1, GGML_TYPE_Q4_0 = 2, GGML_TYPE_Q4_1 = 3, - GGML_TYPE_Q4_2 = 4, + // GGML_TYPE_Q4_2 = 4, support has been removed // GGML_TYPE_Q4_3 (5) support has been removed GGML_TYPE_Q5_0 = 6, GGML_TYPE_Q5_1 = 7, @@ -251,7 +251,6 @@ extern "C" { GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors @@ -876,7 +875,6 @@ extern "C" { GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, 
int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); diff --git a/llama.cpp b/llama.cpp index 4bba93a111ae4..5c6c3e72211fc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -482,7 +482,6 @@ struct llama_file_loader { case GGML_TYPE_F16: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -558,7 +557,6 @@ struct llama_file_saver { case GGML_TYPE_F16: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -852,7 +850,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: return "mostly Q4_1, some F16"; - case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2"; case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0"; case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1"; case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; @@ -1905,7 +1902,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s switch (ftype) { case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; - case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break; case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break; case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break; case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break; diff --git a/llama.h b/llama.h index 58c6e0699a999..fea5ffeffbee0 100644 --- a/llama.h +++ b/llama.h @@ -78,7 +78,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors + // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors From b08c39b16c5f52ca656a92fe46a994868a87b082 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 May 2023 20:00:01 +0300 Subject: [PATCH 13/32] ggml : minor formatting --- ggml.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/ggml.c b/ggml.c index ea73bf8441e6f..7ab747ca3a801 100644 --- a/ggml.c +++ b/ggml.c @@ -1283,8 +1283,8 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict const float d = x[i].d; for (int j = 0; j < qk/2; ++j) { - const int x0 = (x[i].qs[j] & 0xf) - 8; - const int x1 = (x[i].qs[j] >> 4) - 8; + const int x0 = (x[i].qs[j] & 0x0F) - 8; + const int x1 = (x[i].qs[j] >> 4) - 8; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; @@ -1304,8 +1304,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict const float m = x[i].m; for (int j = 0; j < qk/2; ++j) { - const int x0 = (x[i].qs[j] & 0xf); - const int x1 = (x[i].qs[j] >> 4); + const int x0 = (x[i].qs[j] & 0x0F); + const int x1 = (x[i].qs[j] >> 4); y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; @@ -1330,8 +1330,8 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - const int32_t x0 = ((x[i].qs[j] & 0xf) | 
xh_0) - 16; - const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; @@ -1357,8 +1357,8 @@ static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - const int x0 = (x[i].qs[j] & 0xf) | xh_0; - const int x1 = (x[i].qs[j] >> 4) | xh_1; + const int x0 = (x[i].qs[j] & 0x0F) | xh_0; + const int x1 = (x[i].qs[j] >> 4) | xh_1; y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; @@ -2184,8 +2184,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * int sumi = 0; for (int j = 0; j < qk/2; ++j) { - const int v0 = (x[i].qs[j] & 0xf) - 8; - const int v1 = (x[i].qs[j] >> 4) - 8; + const int v0 = (x[i].qs[j] & 0x0F) - 8; + const int v1 = (x[i].qs[j] >> 4) - 8; sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } @@ -2306,8 +2306,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * int sumi = 0; for (int j = 0; j < qk/2; ++j) { - const int v0 = (x[i].qs[j] & 0xf); - const int v1 = (x[i].qs[j] >> 4); + const int v0 = (x[i].qs[j] & 0x0F); + const int v1 = (x[i].qs[j] >> 4); sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } @@ -2487,8 +2487,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; - const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); } From 83674556b8fbba86ec5a482ffac0154d11163526 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 May 2023 20:26:02 +0300 Subject: [PATCH 14/32] ggml : fix Q5_0 quantization --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 7ab747ca3a801..8ef279fd74b6a 100644 --- a/ggml.c +++ b/ggml.c @@ -845,7 +845,7 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r const float d = max / -16; const float id = d ? 1.0f/d : 0.0f; - y[i].d = d; + y[i].d = GGML_FP32_TO_FP16(d); uint32_t qh = 0; From 928d2f335f0e1998b702df3b68a7d670c839a63f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 8 May 2023 22:06:54 +0300 Subject: [PATCH 15/32] scripts : add script for measuring the time per token --- .gitignore | 1 + README.md | 24 +++++------ llama.cpp | 4 +- scripts/perf-run-all.sh | 93 +++++++++++++++++++++++++++++++++++++++++ scripts/ppl-run-all.sh | 4 -- 5 files changed, 108 insertions(+), 18 deletions(-) create mode 100755 scripts/perf-run-all.sh diff --git a/.gitignore b/.gitignore index a5fef327718f0..f5023e3042a81 100644 --- a/.gitignore +++ b/.gitignore @@ -44,5 +44,6 @@ zig-cache/ ppl-*.txt qnt-*.txt +perf-*.txt examples/jeopardy/results.txt diff --git a/README.md b/README.md index 045f995347154..75bd9faf57e5d 100644 --- a/README.md +++ b/README.md @@ -338,18 +338,18 @@ As the models are currently fully loaded into memory, you will need adequate dis Several quantization methods are supported. They differ in the resulting model disk size and inference speed. 
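As a quick cross-check, the bits/weight row in the table below follows directly from the per-block layouts: each format stores one or two scale fields (plus a 32-bit high-bit mask for Q5_*) and the packed quants for a 32-weight block. A minimal standalone sketch — the block byte sizes are hardcoded here to mirror the struct definitions in ggml.c at this point in the series, not taken from the real headers:

#include <stdio.h>

int main(void) {
    // bytes per 32-weight block: scale field(s) + qh mask (q5_*) + packed quants
    const struct { const char * name; int bytes; } fmt[] = {
        { "q4_0", 4 + 16 },          // float d            + 32 x 4-bit
        { "q4_1", 4 + 4 + 16 },      // float d, float m   + 32 x 4-bit
        { "q5_0", 2 + 4 + 16 },      // fp16 d, 32-bit qh  + 32 x 4-bit
        { "q5_1", 2 + 2 + 4 + 16 },  // fp16 d, m, qh      + 32 x 4-bit
        { "q8_0", 4 + 32 },          // float d            + 32 x 8-bit
    };

    for (int i = 0; i < 5; ++i) {
        printf("%s: %.1f bits/weight\n", fmt[i].name, 8.0*fmt[i].bytes/32);
    }

    return 0;
}

Compiled and run, this prints 5.0, 6.0, 5.5, 6.0 and 9.0 bits/weight, matching the corresponding column entries in the updated table below.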
-| Model | Measure | F16 | Q4_0 | Q4_1 | Q4_2 | Q5_0 | Q5_1 | Q8_0 | -|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|-------:| -| 7B | perplexity | 5.9066 | 6.1620 | 6.0910 | 6.1466 | 5.9862 | 5.9481 | 5.9069 | -| 7B | file size | 13.0G | 4.0G | 4.8G | 4.0G | 4.4G | 4.8G | 7.1G | -| 7B | ms/tok @ 4th | 128 | 56 | 61 | 84 | 91 | 95 | 75 | -| 7B | ms/tok @ 8th | 128 | 47 | 55 | 48 | 53 | 59 | 75 | -| 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.0 | 5.5 | 6.0 | 9.0 | -| 13B | perplexity | 5.2543 | 5.3863 | 5.3607 | 5.3513 | 5.2856 | 5.2706 | 5.2548 | -| 13B | file size | 25.0G | 7.6G | 9.1G | 7.6G | 8.4G | 9.1G | 14G | -| 13B | ms/tok @ 4th | 239 | 104 | 113 | 160 | 176 | 185 | 141 | -| 13B | ms/tok @ 8th | 240 | 85 | 99 | 97 | 108 | 117 | 147 | -| 13B | bits/weight | 16.0 | 5.0 | 6.0 | 5.0 | 5.5 | 6.0 | 9.0 | +| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | +|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:| +| 7B | perplexity | 5.9066 | 6.1620 | 6.0910 | 5.9862 | 5.9481 | 5.9069 | +| 7B | file size | 13.0G | 4.0G | 4.8G | 4.4G | 4.8G | 7.1G | +| 7B | ms/tok @ 4th | 128 | 56 | 61 | 91 | 95 | 75 | +| 7B | ms/tok @ 8th | 128 | 47 | 55 | 53 | 59 | 75 | +| 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 | +| 13B | perplexity | 5.2543 | 5.3863 | 5.3607 | 5.2856 | 5.2706 | 5.2548 | +| 13B | file size | 25.0G | 7.6G | 9.1G | 8.4G | 9.1G | 14G | +| 13B | ms/tok @ 4th | 239 | 104 | 113 | 176 | 185 | 141 | +| 13B | ms/tok @ 8th | 240 | 85 | 99 | 108 | 117 | 147 | +| 13B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 | ### Perplexity (measuring model quality) diff --git a/llama.cpp b/llama.cpp index 5c6c3e72211fc..367522ac9051a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2809,9 +2809,9 @@ void llama_print_timings(struct llama_context * ctx) { fprintf(stderr, "\n"); fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0); - fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample); + fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample); fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval); - fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval); + fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval); fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0); } diff --git a/scripts/perf-run-all.sh b/scripts/perf-run-all.sh new file mode 100755 index 0000000000000..7dbfc7c2044e1 --- /dev/null +++ b/scripts/perf-run-all.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# +# Measure the performance (time per token) of the various quantization techniques +# + +QUANTIZE=0 +if [ "$1" != "" ]; then + echo "Quantizing" + QUANTIZE=1 +fi + +if [ "$QUANTIZE" != "0" ]; then + # + # quantize + # + + # 7B + time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-7b-q4_0.txt + time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-7b-q4_1.txt + time ./bin/quantize 
../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-7b-q5_0.txt + time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-7b-q5_1.txt + time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-7b-q8_0.txt + + # 13B + time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-13b-q4_0.txt + time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-13b-q4_1.txt + time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-13b-q5_0.txt + time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-13b-q5_1.txt + time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-13b-q8_0.txt +fi + +# +# perf +# run each command twice +# + +set -x + +# 7B - 4 threads + ./bin/main -m ../models/7B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-f16.txt | grep llama_print_timings + ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q4_0.txt | grep llama_print_timings + ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q4_1.txt | grep llama_print_timings + ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q5_0.txt | grep llama_print_timings + ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q5_1.txt | grep llama_print_timings + ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q8_0.txt | grep llama_print_timings + +# 7B - 8 threads + ./bin/main -m ../models/7B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-f16.txt | grep llama_print_timings + 
./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q4_0.txt | grep llama_print_timings + ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q4_1.txt | grep llama_print_timings + ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q5_0.txt | grep llama_print_timings + ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q5_1.txt | grep llama_print_timings + ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q8_0.txt | grep llama_print_timings + +# 13B - 4 threads + ./bin/main -m ../models/13B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-f16.txt | grep llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q4_0.txt | grep llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q4_1.txt | grep llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q5_0.txt | grep llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q5_1.txt | grep 
llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q8_0.txt | grep llama_print_timings + +# 13B - 8 threads + ./bin/main -m ../models/13B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-f16.txt | grep llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q4_0.txt | grep llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q4_1.txt | grep llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q5_0.txt | grep llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q5_1.txt | grep llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q8_0.txt | grep llama_print_timings diff --git a/scripts/ppl-run-all.sh b/scripts/ppl-run-all.sh index 28f31ca7130b7..c59e3075d26f2 100755 --- a/scripts/ppl-run-all.sh +++ b/scripts/ppl-run-all.sh @@ -7,7 +7,6 @@ # 7B time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-7b-q4_0.txt time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-7b-q4_1.txt -time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_2.bin q4_2 2>&1 | tee ../qnt-7b-q4_2.txt time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-7b-q5_0.txt time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-7b-q5_1.txt time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-7b-q8_0.txt @@ -15,7 +14,6 @@ time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0 # 13B time 
./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-13b-q4_0.txt time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-13b-q4_1.txt -time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_2.bin q4_2 2>&1 | tee ../qnt-13b-q4_2.txt time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-13b-q5_0.txt time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-13b-q5_1.txt time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-13b-q8_0.txt @@ -28,7 +26,6 @@ time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8 time ./bin/perplexity -m ../models/7B/ggml-model-f16.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-f16.txt time ./bin/perplexity -m ../models/7B/ggml-model-q4_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_0.txt time ./bin/perplexity -m ../models/7B/ggml-model-q4_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_1.txt -time ./bin/perplexity -m ../models/7B/ggml-model-q4_2.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_2.txt time ./bin/perplexity -m ../models/7B/ggml-model-q5_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q5_0.txt time ./bin/perplexity -m ../models/7B/ggml-model-q5_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q5_1.txt time ./bin/perplexity -m ../models/7B/ggml-model-q8_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q8_0.txt @@ -37,7 +34,6 @@ time ./bin/perplexity -m ../models/7B/ggml-model-q8_0.bin -f ./wiki.test.raw --n time ./bin/perplexity -m ../models/13B/ggml-model-f16.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-f16.txt time ./bin/perplexity -m ../models/13B/ggml-model-q4_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_0.txt time ./bin/perplexity -m ../models/13B/ggml-model-q4_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_1.txt -time ./bin/perplexity -m ../models/13B/ggml-model-q4_2.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_2.txt time ./bin/perplexity -m ../models/13B/ggml-model-q5_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q5_0.txt time ./bin/perplexity -m ../models/13B/ggml-model-q5_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q5_1.txt time ./bin/perplexity -m ../models/13B/ggml-model-q8_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q8_0.txt From 9e49d20150fe30d1a2be1f39233ddbe15448c6ff Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Mon, 8 May 2023 19:14:06 +0000 Subject: [PATCH 16/32] AVX implementations (#1370) --- SHA256SUMS | 16 +++-------- ggml.c | 82 +++++++++++++++++++----------------------------------- 2 files changed, 33 insertions(+), 65 deletions(-) diff --git a/SHA256SUMS b/SHA256SUMS index e487bdca6c9c2..9db08b597d0f9 100644 --- a/SHA256SUMS +++ b/SHA256SUMS @@ -1,24 +1,19 @@ 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth 666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin -99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6 models/7B/ggml-model-q4_0.bin -cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe models/7B/ggml-model-q4_1.bin 
-25b050337a87344da687a7f2adddc03bd99b7f6c140450e836649f3585fb6496 models/7B/ggml-model-q4_2.bin +ae89af479ab4d31c4e555ad8cc1dc9bf1f68d617186158cc381cd5a0fccd10bd models/7B/ggml-model-q4_0.bin +862072e2036a1bdb1a01ec2e159381f332a9e2357b886031c075fb7efa86db9b models/7B/ggml-model-q4_1.bin +0bef7cefa880a67a0b6d2a7e4559ded235823535ad616808dd8b5e47ff0a202f models/7B/ggml-model-q5_0.bin +97b9c38b2b8aed0c0aa90e0a975570ce3455c47d62128b382c55acbf6e2035f6 models/7B/ggml-model-q5_1.bin 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth 2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin -eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab models/13B/ggml-model-q4_0.bin -d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb models/13B/ggml-model-q4_1.bin -75a218a47df03f5f96354656329864613abcb67779412b9bc2282b28c1c3cbaa models/13B/ggml-model-q4_2.bin 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth 7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin -517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d models/30B/ggml-model-q4_0.bin -7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd models/30B/ggml-model-q4_1.bin -aadbc9cf806313a55be570f62884eed289d30c313fac3b7838717e01bd553204 models/30B/ggml-model-q4_2.bin 2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json 135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth 9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth @@ -29,8 +24,5 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/con 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth 60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin -01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2 models/65B/ggml-model-q4_0.bin -4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f models/65B/ggml-model-q4_1.bin -1b6f6588d0e2ecfe6c4d849088e48e5e3083466b962daa32e3261363e21fc5e9 models/65B/ggml-model-q4_2.bin 999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json 9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model diff --git a/ggml.c b/ggml.c index 8ef279fd74b6a..c0131805dd7cc 100644 --- a/ggml.c +++ b/ggml.c @@ -472,23 +472,16 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); // #if __AVX__ || __AVX2__ || __AVX512F__ -// Unpack 16 4-bit fields into 16 bytes -// The output vector contains 16 bytes, each one in [ 0 .. 
15 ] interval -static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi) -{ - // Load 8 bytes from memory - __m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi ); - - // Expand bytes into uint16_t values - __m128i bytes = _mm_cvtepu8_epi16( tmp ); - - // Unpack values into individual bytes - const __m128i lowMask = _mm_set1_epi8( 0xF ); - __m128i high = _mm_andnot_si128( lowMask, bytes ); - __m128i low = _mm_and_si128( lowMask, bytes ); - high = _mm_slli_epi16( high, 4 ); - bytes = _mm_or_si128( low, high ); - return bytes; +// multiply int8_t, add results pairwise twice +static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { + // Get absolute values of x vectors + const __m128i ax = _mm_sign_epi8(x, x); + // Sign the values of the y vectors + const __m128i sy = _mm_sign_epi8(y, x); + // Perform multiplication and create 16-bit values + const __m128i dot = _mm_maddubs_epi16(ax, sy); + const __m128i ones = _mm_set1_epi16(1); + return _mm_madd_epi16(ones, dot); } // horizontally add 8 floats @@ -535,19 +528,10 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) { // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { - // Load 16 bytes from memory - __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi ); - - // Expand bytes into uint16_t values - __m256i bytes = _mm256_cvtepu8_epi16( tmp ); - - // Unpack values into individual bytes + const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); + const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp); const __m256i lowMask = _mm256_set1_epi8( 0xF ); - __m256i high = _mm256_andnot_si256( lowMask, bytes ); - __m256i low = _mm256_and_si256( lowMask, bytes ); - high = _mm256_slli_epi16( high, 4 ); - bytes = _mm256_or_si256( low, high ); - return bytes; + return _mm256_and_si256(lowMask, bytes); } // add int16_t pairwise and return as float vector @@ -2146,31 +2130,23 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * // Compute combined scale for the block const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); - __m128i i32[2]; - for (int j = 0; j < 2; ++j) { - // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes - __m128i bx = bytes_from_nibbles_16(x[i].qs + 8*j); - __m128i by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16*j)); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
- const __m128i off = _mm_set1_epi8( 8 ); - bx = _mm_sub_epi8( bx, off ); + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); - // Get absolute values of x vectors - const __m128i ax = _mm_sign_epi8(bx, bx); + const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); - // Sign the values of the y vectors - const __m128i sy = _mm_sign_epi8(by, bx); + __m128i bx = _mm_and_si128(lowMask, tmp); + __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx, by); - // Perform multiplication and create 16-bit values - const __m128i dot = _mm_maddubs_epi16(ax, sy); - - const __m128i ones = _mm_set1_epi16(1); - i32[j] = _mm_madd_epi16(ones, dot); - } + bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); + by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx, by); // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] )); + __m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1)); // Apply the scale, and accumulate acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); } @@ -2484,8 +2460,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * int sumi = 0; for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; @@ -2673,8 +2649,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * int sumi = 0; for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; From 489bd13fadc34218166d71ee9b39712a292647de Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 8 May 2023 22:18:15 +0300 Subject: [PATCH 17/32] ggml : uniform 5th bit extraction --- ggml-cuda.cu | 8 ++++---- ggml.c | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 46f7b568c608b..f11d4dc23ddbc 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -123,8 +123,8 @@ static __global__ void dequantize_block_q5_0(const void * vx, float * y) { memcpy(&qh, x[i].qh, sizeof(qh)); for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; @@ -148,8 +148,8 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) { memcpy(&qh, x[i].qh, sizeof(qh)); for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int x0 = (x[i].qs[j] & 0xf) | xh_0; const int x1 = (x[i].qs[j] >> 4) 
| xh_1; diff --git a/ggml.c b/ggml.c index c0131805dd7cc..4335b10d0c387 100644 --- a/ggml.c +++ b/ggml.c @@ -1311,8 +1311,8 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict memcpy(&qh, x[i].qh, sizeof(qh)); for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; @@ -1338,8 +1338,8 @@ static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict memcpy(&qh, x[i].qh, sizeof(qh)); for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int x0 = (x[i].qs[j] & 0x0F) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; @@ -12090,8 +12090,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * memcpy(&qh, &y[i].qh, sizeof(qh)); for (int j = 0; j < QK5_0; j += 2) { - const uint8_t vh0 = ((qh & (1u << (j + 0))) >> (j + 0)) << 4; - const uint8_t vh1 = ((qh & (1u << (j + 1))) >> (j + 1)) << 4; + const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12)); // cast to 16 bins const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; @@ -12120,8 +12120,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * memcpy(&qh, &y[i].qh, sizeof(qh)); for (int j = 0; j < QK5_1; j += 2) { - const uint8_t vh0 = ((qh & (1u << (j + 0))) >> (j + 0)) << 4; - const uint8_t vh1 = ((qh & (1u << (j + 1))) >> (j + 1)) << 4; + const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12)); // cast to 16 bins const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; From d52172a509740daa9bffa81bb8c4de02dc1634e5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 9 May 2023 18:19:13 +0300 Subject: [PATCH 18/32] llama : produce error upon loading old model files --- llama.cpp | 15 +++++++++++++-- llama.h | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index 367522ac9051a..334d4e1bc260d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -402,6 +402,7 @@ enum llama_file_version { LLAMA_FILE_VERSION_GGML, LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab LLAMA_FILE_VERSION_GGJT_V1, // added padding + LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format }; struct llama_file_loader { @@ -432,6 +433,8 @@ struct llama_file_loader { file_version = LLAMA_FILE_VERSION_GGMF_V1; } else if (magic == 'ggjt' && version == 1) { file_version = LLAMA_FILE_VERSION_GGJT_V1; + } else if (magic == 'ggjt' && version == 2) { + file_version = LLAMA_FILE_VERSION_GGJT_V2; } else { throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", magic, version); @@ -837,8 +840,8 @@ static const char *llama_file_version_name(llama_file_version version) { switch (version) { case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)"; case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)"; - case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 
(latest)"; - default: LLAMA_ASSERT(false); + case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1305)"; + case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)"; } } @@ -915,6 +918,14 @@ static void llama_model_load_internal( fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); } + if (file_version != LLAMA_FILE_VERSION_GGJT_V2) { + if (hparams.ftype != LLAMA_FTYPE_ALL_F32 && + hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 && + hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) { + throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)"); + } + } + if (vocab_only) { return; } diff --git a/llama.h b/llama.h index fea5ffeffbee0..1a65cd5892389 100644 --- a/llama.h +++ b/llama.h @@ -19,7 +19,7 @@ # define LLAMA_API #endif -#define LLAMA_FILE_VERSION 1 +#define LLAMA_FILE_VERSION 2 #define LLAMA_FILE_MAGIC 'ggjt' #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml' #define LLAMA_SESSION_MAGIC 'ggsn' From 09032e0290b9393210e1280c05fb03b39e6d57ee Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 9 May 2023 18:25:28 +0300 Subject: [PATCH 19/32] llama : fix model magic/version write --- llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 334d4e1bc260d..1ba9a62b8cb50 100644 --- a/llama.cpp +++ b/llama.cpp @@ -529,8 +529,8 @@ struct llama_file_saver { write_vocab(); } void write_magic() { - file.write_u32('ggjt'); // magic - file.write_u32(1); // version + file.write_u32(LLAMA_FILE_MAGIC); // magic + file.write_u32(LLAMA_FILE_VERSION); // version } void write_hparams(enum llama_ftype new_ftype) { const llama_hparams & hparams = any_file_loader->hparams; From b7ad385d42f09d640e334283ebac46df3813a7b6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 10 May 2023 22:58:45 +0300 Subject: [PATCH 20/32] ggml : speed-up Q5_0 + Q5_1 at 4 threads --- ggml.c | 223 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 147 insertions(+), 76 deletions(-) diff --git a/ggml.c b/ggml.c index 4335b10d0c387..6efd51b864044 100644 --- a/ggml.c +++ b/ggml.c @@ -339,8 +339,9 @@ static float table_f32_f16[1 << 16]; #define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) #define B8(c,s ) B7(c,s, c), B7(c,s, s) -// precomputed tables for expanding 8bits to 8 bytes (shl 4) -static const uint64_t table_b2b_u[1 << 8] = { B8(00, 10) }; +// precomputed tables for expanding 8bits to 8 bytes: +static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 +static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, @@ -2307,68 +2308,102 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const block_q8_0 * restrict y = vy; #if defined(__ARM_NEON) - float32x4_t sumv = vdupq_n_f32(0.0f); + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); - uint64_t tmp[4]; + uint32_t qh0; + uint32_t qh1; - for (int i = 0; i < nb; ++i) { + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + for (int i = 0; i < nb; i += 2) { const block_q5_0 * restrict x0 = &x[i]; + const block_q5_0 * restrict x1 = &x[i + 1]; const block_q8_0 * restrict y0 = &y[i]; + const block_q8_0 * restrict y1 = &y[i + 1]; - const uint8x16_t m4b = vdupq_n_u8(0x0F); - const int8x16_t s16b = vdupq_n_s8(0x10); + const uint8x16_t m4b = vdupq_n_u8(0x0F); - // extract the 5th bit - uint32_t qh; - memcpy(&qh, x0->qh, sizeof(qh)); + // extract the 5th bit 
via lookup table ((!b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); - tmp[0] = table_b2b_u[(qh >> 0) & 0xFF]; - tmp[1] = table_b2b_u[(qh >> 8) & 0xFF]; - tmp[2] = table_b2b_u[(qh >> 16) & 0xFF]; - tmp[3] = table_b2b_u[(qh >> 24) ]; + tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_1[(qh0 >> 24) ]; - const int8x16_t qhl = vld1q_s8((const int8_t *)(tmp + 0)); - const int8x16_t qhh = vld1q_s8((const int8_t *)(tmp + 2)); + tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_1[(qh1 >> 24) ]; - const uint8x16_t v0 = vld1q_u8(x0->qs); + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); // 4-bit -> 8-bit - const int8x16_t v0l = vreinterpretq_s8_u8(vandq_u8 (v0, m4b)); - const int8x16_t v0h = vreinterpretq_s8_u8(vshrq_n_u8(v0, 4)); + int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - // add high bit and sub 16 - const int8x16_t v0lf = vsubq_s8(vorrq_s8(v0l, qhl), s16b); - const int8x16_t v0hf = vsubq_s8(vorrq_s8(v0h, qhh), s16b); + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1); // load y - const int8x16_t v1l = vld1q_s8(y0->qs); - const int8x16_t v1h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); const float x0d = GGML_FP16_TO_FP32(x0->d); + const float x1d = GGML_FP16_TO_FP32(x1->d); #if defined(__ARM_FEATURE_DOTPROD) - sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0lf, v1l), - vdotq_s32(vdupq_n_s32(0), v0hf, v1h))), x0d*y0->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d); #else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0lf), vget_low_s8 (v1l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0lf), vget_high_s8(v1l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0hf), vget_low_s8 (v1h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0hf), vget_high_s8(v1h)); + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), 
vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d); #endif } - *s = vaddvq_f32(sumv); + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); #elif defined(__wasm_simd128__) v128_t sumv = wasm_f32x4_splat(0.0f); + uint32_t qh; uint64_t tmp[4]; + // TODO: check if unrolling this is better for (int i = 0; i < nb; ++i) { const block_q5_0 * restrict x0 = &x[i]; const block_q8_0 * restrict y0 = &y[i]; @@ -2377,13 +2412,12 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const v128_t s16b = wasm_i8x16_splat(0x10); // extract the 5th bit - uint32_t qh; memcpy(&qh, x0->qh, sizeof(qh)); - tmp[0] = table_b2b_u[(qh >> 0) & 0xFF]; - tmp[1] = table_b2b_u[(qh >> 8) & 0xFF]; - tmp[2] = table_b2b_u[(qh >> 16) & 0xFF]; - tmp[3] = table_b2b_u[(qh >> 24) ]; + tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_1[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_1[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_1[(qh >> 24) ]; const v128_t qhl = wasm_v128_load(tmp + 0); const v128_t qhh = wasm_v128_load(tmp + 2); @@ -2395,8 +2429,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const v128_t v0h = wasm_u8x16_shr(v0, 4); // add high bit and sub 16 - const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0l, qhl), s16b); - const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0h, qhh), s16b); + const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); + const v128_t v0hf = wasm_i8x16_sub(v0h, qhh); // load y const v128_t v1l = wasm_v128_load(y0->qs); @@ -2488,69 +2522,107 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const block_q8_1 * restrict y = vy; #if defined(__ARM_NEON) - float32x4_t sumv = vdupq_n_f32(0.0f); + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); - float summs = 0.0f; + float summs0 = 0.0f; + float summs1 = 0.0f; - uint64_t tmp[4]; + uint32_t qh0; + uint32_t qh1; - for (int i = 0; i < nb; ++i) { + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + for (int i = 0; i < nb; i += 2) { const block_q5_1 * restrict x0 = &x[i]; + const block_q5_1 * restrict x1 = &x[i + 1]; const block_q8_1 * restrict y0 = &y[i]; + const block_q8_1 * restrict y1 = &y[i + 1]; - summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1); + const uint8x16_t m4b = vdupq_n_u8(0x0F); - // extract the 5th bit - uint32_t qh; - memcpy(&qh, x0->qh, sizeof(qh)); + summs0 += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1); + summs1 += GGML_FP16_TO_FP32(x1->m) * (y1->s0 + y1->s1); + + // extract the 5th bit via lookup table ((b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); - tmp[0] = table_b2b_u[(qh >> 0) & 0xFF]; - tmp[1] = table_b2b_u[(qh >> 8) & 0xFF]; - tmp[2] = table_b2b_u[(qh >> 16) & 0xFF]; - tmp[3] = table_b2b_u[(qh >> 24) ]; + tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; + tmp0[3] = 
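/* Each table_b2b_0 lookup expands one byte of qh into 8 bytes, one per
   quant, with the source bit moved to nibble position 4. A plausible way
   to compute such an entry (illustrative sketch, not the actual table
   initializer):

       uint64_t t = 0;
       for (int k = 0; k < 8; ++k)
           t |= (uint64_t)((b >> k) & 1) << (8*k + 4);   // byte k: 0x10 or 0x00

   table_b2b_1, used by the q5_0 path above, stores the inverted bit so
   that a single vsubq_s8 applies the high bit and the -16 offset at once. */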
table_b2b_0[(qh0 >> 24) ]; - const int8x16_t qhl = vld1q_s8((const int8_t *)(tmp + 0)); - const int8x16_t qhh = vld1q_s8((const int8_t *)(tmp + 2)); + tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_0[(qh1 >> 24) ]; - const uint8x16_t v0 = vld1q_u8(x0->qs); + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); // 4-bit -> 8-bit - const int8x16_t v0l = vreinterpretq_s8_u8(vandq_u8 (v0, vdupq_n_u8(0x0F))); - const int8x16_t v0h = vreinterpretq_s8_u8(vshrq_n_u8(v0, 4)); + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - // add - const int8x16_t v0lf = vorrq_s8(v0l, qhl); - const int8x16_t v0hf = vorrq_s8(v0h, qhh); + // add 5th bit + const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1); // load y - const int8x16_t v1l = vld1q_s8(y0->qs); - const int8x16_t v1h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); const float x0d = GGML_FP16_TO_FP32(x0->d); + const float x1d = GGML_FP16_TO_FP32(x1->d); #if defined(__ARM_FEATURE_DOTPROD) - sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0lf, v1l), - vdotq_s32(vdupq_n_s32(0), v0hf, v1h))), x0d*y0->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d); #else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0lf), vget_low_s8 (v1l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0lf), vget_high_s8(v1l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0hf), vget_low_s8 (v1h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0hf), vget_high_s8(v1h)); + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - sumv = 
vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d); #endif } - *s = vaddvq_f32(sumv) + summs; + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; #elif defined(__wasm_simd128__) v128_t sumv = wasm_f32x4_splat(0.0f); float summs = 0.0f; + uint32_t qh; uint64_t tmp[4]; for (int i = 0; i < nb; ++i) { @@ -2562,13 +2634,12 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const v128_t m4b = wasm_i8x16_splat(0x0F); // extract the 5th bit - uint32_t qh; memcpy(&qh, x0->qh, sizeof(qh)); - tmp[0] = table_b2b_u[(qh >> 0) & 0xFF]; - tmp[1] = table_b2b_u[(qh >> 8) & 0xFF]; - tmp[2] = table_b2b_u[(qh >> 16) & 0xFF]; - tmp[3] = table_b2b_u[(qh >> 24) ]; + tmp[0] = table_b2b_0[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_0[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_0[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_0[(qh >> 24) ]; const v128_t qhl = wasm_v128_load(tmp + 0); const v128_t qhh = wasm_v128_load(tmp + 2); From 695f3963b180c90a571c75e8e14d1cf1365632cc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 19:46:11 +0300 Subject: [PATCH 21/32] ggml : preserve old Q4 and Q5 formats --- ggml-cuda.cu | 27 ++++++++------- ggml-opencl.c | 1 + ggml.c | 91 ++++++++++++++++++++++++++------------------------- llama.cpp | 4 +-- 4 files changed, 62 insertions(+), 61 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index f11d4dc23ddbc..08d1566bdd880 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -86,8 +86,8 @@ static __global__ void dequantize_block_q4_0(const void * vx, float * y) { const int x0 = (x[i].qs[j] & 0xf) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; - y[i*qk + j + 0 ] = x0*d; - y[i*qk + j + qk/2] = x1*d; + y[i*qk + 2*j + 0] = x0*d; + y[i*qk + 2*j + 1] = x1*d; } } @@ -105,8 +105,8 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) { const int x0 = (x[i].qs[j] & 0xf); const int x1 = (x[i].qs[j] >> 4); - y[i*qk + j + 0 ] = x0*d + m; - y[i*qk + j + qk/2] = x1*d + m; + y[i*qk + 2*j + 0] = x0*d + m; + y[i*qk + 2*j + 1] = x1*d + m; } } @@ -129,8 +129,8 @@ static __global__ void dequantize_block_q5_0(const void * vx, float * y) { const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - y[i*qk + j + 0 ] = x0*d; - y[i*qk + j + qk/2] = x1*d; + y[i*qk + 2*j + 0] = x0*d; + y[i*qk + 2*j + 1] = x1*d; } } @@ -154,24 +154,23 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) { const int x0 = (x[i].qs[j] & 0xf) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; - y[i*qk + j + 0 ] = x0*d + m; - y[i*qk + j + qk/2] = x1*d + m; + y[i*qk + 2*j + 0] = x0*d + m; + y[i*qk + 2*j + 1] = x1*d + m; } } static __global__ void dequantize_block_q8_0(const void * vx, float * y) { + static const int qk = QK8_0; + const block_q8_0 * x = (const block_q8_0 *) vx; const int i = blockIdx.x; const float d = x[i].d; - const int8_t * pp = x[i].qs; - - for (int l = 0; l < QK8_0; l++) { - const int8_t vi = pp[l]; - - y[i*QK8_0 + l] = vi*d; + for (int j = 0; j < qk/2; ++j) { + y[i*qk + 2*j + 0] = x[i].qs[j + 0 ]*d; + y[i*qk + 2*j + 1] = x[i].qs[j + qk/2]*d; } } diff --git a/ggml-opencl.c b/ggml-opencl.c index 0e6e6770f6307..230c84f2fb411 100644 --- a/ggml-opencl.c +++ b/ggml-opencl.c @@ -114,6 +114,7 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f const uint i = get_global_id(0) / 32; const uint 
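/* Why the TODO below flags this kernel (illustrative): after the layout
   change the block stores the 16 even elements first and the 16 odd
   elements second, so the correct mapping would be

       result[i*32 + 2*(l%16) + l/16] = blocks[i].qs[l] * blocks[i].d;

   rather than the straight result[i*32 + l] indexing. */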
l = get_local_id(0); + // TODO: this is broken result[i*32 + l] = blocks[i].qs[l] * blocks[i].d; } diff --git a/ggml.c b/ggml.c index 6efd51b864044..21c297e5a0cc6 100644 --- a/ggml.c +++ b/ggml.c @@ -751,8 +751,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r y[i].d = d; for (int j = 0; j < qk/2; ++j) { - const float x0 = x[i*qk + 0 + j]*id; - const float x1 = x[i*qk + qk/2 + j]*id; + const float x0 = x[i*qk + 2*j + 0]*id; + const float x1 = x[i*qk + 2*j + 1]*id; const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); @@ -792,8 +792,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r y[i].m = min; for (int j = 0; j < qk/2; ++j) { - const float x0 = (x[i*qk + 0 + j] - min)*id; - const float x1 = (x[i*qk + qk/2 + j] - min)*id; + const float x0 = (x[i*qk + 2*j + 0] - min)*id; + const float x1 = (x[i*qk + 2*j + 1] - min)*id; const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); @@ -835,8 +835,8 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r uint32_t qh = 0; for (int j = 0; j < qk/2; ++j) { - const float x0 = x[i*qk + 0 + j]*id; - const float x1 = x[i*qk + qk/2 + j]*id; + const float x0 = x[i*qk + 2*j + 0]*id; + const float x1 = x[i*qk + 2*j + 1]*id; const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); @@ -883,8 +883,8 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r uint32_t qh = 0; for (int j = 0; j < qk/2; ++j) { - const float x0 = (x[i*qk + 0 + j] - min)*id; - const float x1 = (x[i*qk + qk/2 + j] - min)*id; + const float x0 = (x[i*qk + 2*j + 0] - min)*id; + const float x1 = (x[i*qk + 2*j + 1] - min)*id; const uint8_t xi0 = (uint8_t)(x0 + 0.5f); const uint8_t xi1 = (uint8_t)(x1 + 0.5f); @@ -922,10 +922,12 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r y[i].d = d; - for (int j = 0; j < QK8_0; ++j) { - const float v0 = x[i*QK8_0 + j]*id; + for (int j = 0; j < QK8_0/2; ++j) { + const float v0 = x[i*QK8_0 + 2*j + 0]*id; + const float v1 = x[i*QK8_0 + 2*j + 1]*id; - y[i].qs[j] = roundf(v0); + y[i].qs[ j] = v0 + 0.5f; + y[i].qs[QK8_0/2 + j] = v1 + 0.5f; } } } @@ -943,12 +945,12 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int float32x4_t asrcv[8]; float32x4_t amaxv[8]; - for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l); - for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]); + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); - for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]); - for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]); - for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]); + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); const float amax = vmaxvq_f32(amaxv[0]); @@ -957,14 +959,14 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int y[i].d = d; - for (int l = 0; l < 8; l++) { - const float32x4_t v = vmulq_n_f32(srcv[l], id); + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); const int32x4_t vi = vcvtnq_s32_f32(v); - 
y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3); + y[i].qs[ 2*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1); + y[i].qs[ 2*j + 1] = vgetq_lane_s32(vi, 2); + y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3); } } #elif defined(__AVX2__) || defined(__AVX__) @@ -1080,11 +1082,11 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r int sum1 = 0; for (int j = 0; j < QK8_1/2; ++j) { - const float v0 = x[i*QK8_1 + j]*id; - const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id; + const float v0 = x[i*QK8_1 + 2*j + 0]*id; + const float v1 = x[i*QK8_1 + 2*j + 1]*id; - y[i].qs[ j] = roundf(v0); - y[i].qs[QK8_1/2 + j] = roundf(v1); + y[i].qs[ j] = v0 + 0.5f; + y[i].qs[QK8_1/2 + j] = v1 + 0.5f; sum0 += y[i].qs[ j]; sum1 += y[i].qs[QK8_1/2 + j]; @@ -1129,10 +1131,10 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int const float32x4_t v = vmulq_n_f32(srcv[j], id); const int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + y[i].qs[ 2*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1); + y[i].qs[ 2*j + 1] = vgetq_lane_s32(vi, 2); + y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3); accv0 = vaddq_s32(accv0, vi); } @@ -1142,10 +1144,10 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int const float32x4_t v = vmulq_n_f32(srcv[j], id); const int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + y[i].qs[ 2*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1); + y[i].qs[ 2*j + 1] = vgetq_lane_s32(vi, 2); + y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3); accv1 = vaddq_s32(accv1, vi); } @@ -1271,8 +1273,8 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict const int x0 = (x[i].qs[j] & 0x0F) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; - y[i*qk + j + 0 ] = x0*d; - y[i*qk + j + qk/2] = x1*d; + y[i*qk + 2*j + 0] = x0*d; + y[i*qk + 2*j + 1] = x1*d; } } } @@ -1292,8 +1294,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict const int x0 = (x[i].qs[j] & 0x0F); const int x1 = (x[i].qs[j] >> 4); - y[i*qk + j + 0 ] = x0*d + m; - y[i*qk + j + qk/2] = x1*d + m; + y[i*qk + 2*j + 0] = x0*d + m; + y[i*qk + 2*j + 1] = x1*d + m; } } } @@ -1318,8 +1320,8 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - y[i*qk + j + 0 ] = x0*d; - y[i*qk + j + qk/2] = x1*d; + y[i*qk + 2*j + 0] = x0*d; + y[i*qk + 2*j + 1] = x1*d; } } } @@ -1345,8 +1347,8 @@ static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict const int x0 = (x[i].qs[j] & 0x0F) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; - y[i*qk + j + 0 ] = x0*d + m; - y[i*qk + j + qk/2] = x1*d + m; + y[i*qk + 2*j + 0] = x0*d + m; + y[i*qk + 2*j + 1] = x1*d + m; } } } @@ -1363,8 +1365,9 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in for (int i = 0; i < nb; i++) { const float d = x[i].d; - for (int j = 0; j < qk; ++j) { - y[i*qk + j] = 
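/* The replacement loop below reflects the same layout (illustrative
   summary): the quants are stored de-interleaved, evens first, odds
   second,

       qs[j]        holds element 2*j + 0   (j = 0 .. qk/2 - 1)
       qs[qk/2 + j] holds element 2*j + 1

   matching the order in which the quantizers above write their output. */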
x[i].qs[j]*d; + for (int j = 0; j < qk/2; ++j) { + y[i*qk + 2*j + 0] = x[i].qs[j + 0 ]*d; + y[i*qk + 2*j + 1] = x[i].qs[j + qk/2]*d; } } } diff --git a/llama.cpp b/llama.cpp index 1ba9a62b8cb50..be9f8fffb8084 100644 --- a/llama.cpp +++ b/llama.cpp @@ -919,9 +919,7 @@ static void llama_model_load_internal( } if (file_version != LLAMA_FILE_VERSION_GGJT_V2) { - if (hparams.ftype != LLAMA_FTYPE_ALL_F32 && - hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 && - hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) { + if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) { throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)"); } } From 582a39fff532d2d57d5a41106a7f3a3309e49765 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 20:11:37 +0300 Subject: [PATCH 22/32] ggml : simplify Q8_1 - no need for low / high sums anymore --- ggml.c | 69 +++++++++++++++++++--------------------------------------- 1 file changed, 22 insertions(+), 47 deletions(-) diff --git a/ggml.c b/ggml.c index 21c297e5a0cc6..619ce33d843e3 100644 --- a/ggml.c +++ b/ggml.c @@ -718,12 +718,11 @@ static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block siz #define QK8_1 32 typedef struct { - float d; // delta - float s0; // d * sum(qs[i]) low - float s1; // d * sum(qs[i]) high - int8_t qs[QK8_1]; // quants + float d; // delta + float s; // d * sum(qs[i]) + int8_t qs[QK8_1]; // quants } block_q8_1; -static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); +static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); // reference implementation for deterministic creation of model files static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { @@ -1078,8 +1077,7 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r y[i].d = d; - int sum0 = 0; - int sum1 = 0; + int sum = 0; for (int j = 0; j < QK8_1/2; ++j) { const float v0 = x[i*QK8_1 + 2*j + 0]*id; @@ -1088,12 +1086,11 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r y[i].qs[ j] = v0 + 0.5f; y[i].qs[QK8_1/2 + j] = v1 + 0.5f; - sum0 += y[i].qs[ j]; - sum1 += y[i].qs[QK8_1/2 + j]; + sum += y[i].qs[ j]; + sum += y[i].qs[QK8_1/2 + j]; } - y[i].s0 = d * sum0; - y[i].s1 = d * sum1; + y[i].s = d * sum; } } @@ -1123,24 +1120,9 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int y[i].d = d; - int32x4_t accv0 = vdupq_n_s32(0); - int32x4_t accv1 = vdupq_n_s32(0); - - // low half - for (int j = 0; j < 4; j++) { - const float32x4_t v = vmulq_n_f32(srcv[j], id); - const int32x4_t vi = vcvtnq_s32_f32(v); + int32x4_t accv = vdupq_n_s32(0); - y[i].qs[ 2*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1); - y[i].qs[ 2*j + 1] = vgetq_lane_s32(vi, 2); - y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3); - - accv0 = vaddq_s32(accv0, vi); - } - - // high half - for (int j = 4; j < 8; j++) { + for (int j = 0; j < 8; j++) { const float32x4_t v = vmulq_n_f32(srcv[j], id); const int32x4_t vi = vcvtnq_s32_f32(v); @@ -1149,14 +1131,10 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int y[i].qs[ 2*j + 1] = vgetq_lane_s32(vi, 2); y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3); - accv1 = vaddq_s32(accv1, vi); + accv = vaddq_s32(accv, vi); } - const int32_t sum0 = vaddvq_s32(accv0); - const int32_t sum1 = vaddvq_s32(accv1); - - y[i].s0 = d * sum0; - y[i].s1 = d * sum1; + y[i].s = d * 
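/* Why a single cached sum suffices (illustrative): for a q4_1/q5_1 block
   with x_j = d4*q_j + m dotted against y_j = d8*p_j,

       sum_j x_j*y_j = d4*d8 * sum_j q_j*p_j + m * (d8 * sum_j p_j)
                     = d4*d8 * sumi          + m * y->s

   so the old per-half sums s0 and s1 were never needed individually. */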
vaddvq_s32(accv); } #elif defined(__AVX2__) || defined(__AVX__) for (int i = 0; i < nb; i++) { @@ -1205,9 +1183,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int #if defined(__AVX2__) // Compute the sum of the quants and set y[i].s - //y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); - y[i].s0 = d * hsum_i32_8(_mm256_add_epi32(i0, i1)); - y[i].s1 = d * hsum_i32_8(_mm256_add_epi32(i2, i3)); + y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); // Convert int32 to int16 i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 @@ -1237,8 +1213,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int // Compute the sum of the quants and set y[i].s const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); - y[i].s0 = d * hsum_i32_4(s0); - y[i].s1 = d * hsum_i32_4(s1); + y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1)); // Convert int32 to int16 ni0 = _mm_packs_epi32( ni0, ni1 ); @@ -2200,7 +2175,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * const block_q8_1 * restrict y0 = &y[i + 0]; const block_q8_1 * restrict y1 = &y[i + 1]; - summs += x0->m * (y0->s0 + y0->s1) + x1->m * (y1->s0 + y1->s1); + summs += x0->m * y0->s + x1->m * y1->s; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -2259,7 +2234,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * const float * d0 = &x[i].d; const float * d1 = &y[i].d; - summs += x[i].m * (y[i].s0 + y[i].s1); + summs += x[i].m * y[i].s; const __m256 d0v = _mm256_broadcast_ss( d0 ); const __m256 d1v = _mm256_broadcast_ss( d1 ); @@ -2292,7 +2267,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } - sumf += (x[i].d*y[i].d)*sumi + x[i].m*(y[i].s0 + y[i].s1); + sumf += (x[i].d*y[i].d)*sumi + x[i].m*y[i].s; } *s = sumf; @@ -2545,8 +2520,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const uint8x16_t m4b = vdupq_n_u8(0x0F); - summs0 += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1); - summs1 += GGML_FP16_TO_FP32(x1->m) * (y1->s0 + y1->s1); + summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s; + summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s; // extract the 5th bit via lookup table ((b) << 4) memcpy(&qh0, x0->qh, sizeof(qh0)); @@ -2632,7 +2607,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const block_q5_1 * restrict x0 = &x[i]; const block_q8_1 * restrict y0 = &y[i]; - summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1); + summs += GGML_FP16_TO_FP32(x0->m) * y0->s; const v128_t m4b = wasm_i8x16_splat(0x0F); @@ -2696,7 +2671,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * for (int i = 0; i < nb; i++) { const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); - summs += GGML_FP16_TO_FP32(x[i].m) * (y[i].s0 + y[i].s1); + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; __m256i bx = bytes_from_nibbles_32(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -2732,7 +2707,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); } - sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*(y[i].s0 + y[i].s1); + sumf += 
(GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; } *s = sumf; From 6680244838dbddcd20a08eed39398520573c94fd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 20:47:41 +0300 Subject: [PATCH 23/32] ggml : fix Q8_0 and Q8_1 rounding --- ggml.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml.c b/ggml.c index 619ce33d843e3..a39ae74b13291 100644 --- a/ggml.c +++ b/ggml.c @@ -925,8 +925,8 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r const float v0 = x[i*QK8_0 + 2*j + 0]*id; const float v1 = x[i*QK8_0 + 2*j + 1]*id; - y[i].qs[ j] = v0 + 0.5f; - y[i].qs[QK8_0/2 + j] = v1 + 0.5f; + y[i].qs[ j] = roundf(v0); + y[i].qs[QK8_0/2 + j] = roundf(v1); } } } @@ -1083,8 +1083,8 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r const float v0 = x[i*QK8_1 + 2*j + 0]*id; const float v1 = x[i*QK8_1 + 2*j + 1]*id; - y[i].qs[ j] = v0 + 0.5f; - y[i].qs[QK8_1/2 + j] = v1 + 0.5f; + y[i].qs[ j] = roundf(v0); + y[i].qs[QK8_1/2 + j] = roundf(v1); sum += y[i].qs[ j]; sum += y[i].qs[QK8_1/2 + j]; From bd5e373058de91d6f1dd8e58c108dea96fd86ec4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 20:57:28 +0300 Subject: [PATCH 24/32] Revert "AVX implementations (#1370)" This reverts commit 948d124837f9d287d8490f41338e0e4cceb0814f. --- SHA256SUMS | 16 ++++++++--- ggml.c | 82 +++++++++++++++++++++++++++++++++++------------------- 2 files changed, 65 insertions(+), 33 deletions(-) diff --git a/SHA256SUMS b/SHA256SUMS index 9db08b597d0f9..e487bdca6c9c2 100644 --- a/SHA256SUMS +++ b/SHA256SUMS @@ -1,19 +1,24 @@ 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth 666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin -ae89af479ab4d31c4e555ad8cc1dc9bf1f68d617186158cc381cd5a0fccd10bd models/7B/ggml-model-q4_0.bin -862072e2036a1bdb1a01ec2e159381f332a9e2357b886031c075fb7efa86db9b models/7B/ggml-model-q4_1.bin -0bef7cefa880a67a0b6d2a7e4559ded235823535ad616808dd8b5e47ff0a202f models/7B/ggml-model-q5_0.bin -97b9c38b2b8aed0c0aa90e0a975570ce3455c47d62128b382c55acbf6e2035f6 models/7B/ggml-model-q5_1.bin +99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6 models/7B/ggml-model-q4_0.bin +cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe models/7B/ggml-model-q4_1.bin +25b050337a87344da687a7f2adddc03bd99b7f6c140450e836649f3585fb6496 models/7B/ggml-model-q4_2.bin 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth 2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin +eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab models/13B/ggml-model-q4_0.bin +d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb models/13B/ggml-model-q4_1.bin +75a218a47df03f5f96354656329864613abcb67779412b9bc2282b28c1c3cbaa models/13B/ggml-model-q4_2.bin 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 
models/30B/consolidated.02.pth 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth 7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin +517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d models/30B/ggml-model-q4_0.bin +7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd models/30B/ggml-model-q4_1.bin +aadbc9cf806313a55be570f62884eed289d30c313fac3b7838717e01bd553204 models/30B/ggml-model-q4_2.bin 2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json 135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth 9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth @@ -24,5 +29,8 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/con 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth 60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin +01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2 models/65B/ggml-model-q4_0.bin +4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f models/65B/ggml-model-q4_1.bin +1b6f6588d0e2ecfe6c4d849088e48e5e3083466b962daa32e3261363e21fc5e9 models/65B/ggml-model-q4_2.bin 999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json 9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model diff --git a/ggml.c b/ggml.c index a39ae74b13291..30485d113b1ba 100644 --- a/ggml.c +++ b/ggml.c @@ -473,16 +473,23 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); // #if __AVX__ || __AVX2__ || __AVX512F__ -// multiply int8_t, add results pairwise twice -static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { - // Get absolute values of x vectors - const __m128i ax = _mm_sign_epi8(x, x); - // Sign the values of the y vectors - const __m128i sy = _mm_sign_epi8(y, x); - // Perform multiplication and create 16-bit values - const __m128i dot = _mm_maddubs_epi16(ax, sy); - const __m128i ones = _mm_set1_epi16(1); - return _mm_madd_epi16(ones, dot); +// Unpack 16 4-bit fields into 16 bytes +// The output vector contains 16 bytes, each one in [ 0 .. 15 ] interval +static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi) +{ + // Load 8 bytes from memory + __m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi ); + + // Expand bytes into uint16_t values + __m128i bytes = _mm_cvtepu8_epi16( tmp ); + + // Unpack values into individual bytes + const __m128i lowMask = _mm_set1_epi8( 0xF ); + __m128i high = _mm_andnot_si128( lowMask, bytes ); + __m128i low = _mm_and_si128( lowMask, bytes ); + high = _mm_slli_epi16( high, 4 ); + bytes = _mm_or_si128( low, high ); + return bytes; } // horizontally add 8 floats @@ -529,10 +536,19 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) { // The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { - const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); - const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp); + // Load 16 bytes from memory + __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi ); + + // Expand bytes into uint16_t values + __m256i bytes = _mm256_cvtepu8_epi16( tmp ); + + // Unpack values into individual bytes const __m256i lowMask = _mm256_set1_epi8( 0xF ); - return _mm256_and_si256(lowMask, bytes); + __m256i high = _mm256_andnot_si256( lowMask, bytes ); + __m256i low = _mm256_and_si256( lowMask, bytes ); + high = _mm256_slli_epi16( high, 4 ); + bytes = _mm256_or_si256( low, high ); + return bytes; } // add int16_t pairwise and return as float vector @@ -2109,23 +2125,31 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * // Compute combined scale for the block const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); - const __m128i lowMask = _mm_set1_epi8(0xF); - const __m128i off = _mm_set1_epi8(8); + __m128i i32[2]; + for (int j = 0; j < 2; ++j) { + // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes + __m128i bx = bytes_from_nibbles_16(x[i].qs + 8*j); + __m128i by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16*j)); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. + const __m128i off = _mm_set1_epi8( 8 ); + bx = _mm_sub_epi8( bx, off ); - const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); + // Get absolute values of x vectors + const __m128i ax = _mm_sign_epi8(bx, bx); - __m128i bx = _mm_and_si128(lowMask, tmp); - __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs); - bx = _mm_sub_epi8(bx, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx, by); + // Sign the values of the y vectors + const __m128i sy = _mm_sign_epi8(by, bx); - bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); - by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); - bx = _mm_sub_epi8(bx, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx, by); + // Perform multiplication and create 16-bit values + const __m128i dot = _mm_maddubs_epi16(ax, sy); + + const __m128i ones = _mm_set1_epi16(1); + i32[j] = _mm_madd_epi16(ones, dot); + } // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1)); + __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] )); // Apply the scale, and accumulate acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); } @@ -2472,8 +2496,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * int sumi = 0; for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; @@ -2698,8 +2722,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * int sumi = 0; for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; From 
5bc286ab18e20db719cc0959bab3803f119d404c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 21:22:27 +0300 Subject: [PATCH 25/32] ggml : fix AVX2 implementation --- ggml.c | 106 +++++++++++++++++++++++++++------------------------------ 1 file changed, 51 insertions(+), 55 deletions(-) diff --git a/ggml.c b/ggml.c index 30485d113b1ba..f4e626433ee1f 100644 --- a/ggml.c +++ b/ggml.c @@ -473,23 +473,16 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); // #if __AVX__ || __AVX2__ || __AVX512F__ -// Unpack 16 4-bit fields into 16 bytes -// The output vector contains 16 bytes, each one in [ 0 .. 15 ] interval -static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi) -{ - // Load 8 bytes from memory - __m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi ); - - // Expand bytes into uint16_t values - __m128i bytes = _mm_cvtepu8_epi16( tmp ); - - // Unpack values into individual bytes - const __m128i lowMask = _mm_set1_epi8( 0xF ); - __m128i high = _mm_andnot_si128( lowMask, bytes ); - __m128i low = _mm_and_si128( lowMask, bytes ); - high = _mm_slli_epi16( high, 4 ); - bytes = _mm_or_si128( low, high ); - return bytes; +// multiply int8_t, add results pairwise twice +static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { + // Get absolute values of x vectors + const __m128i ax = _mm_sign_epi8(x, x); + // Sign the values of the y vectors + const __m128i sy = _mm_sign_epi8(y, x); + // Perform multiplication and create 16-bit values + const __m128i dot = _mm_maddubs_epi16(ax, sy); + const __m128i ones = _mm_set1_epi16(1); + return _mm_madd_epi16(ones, dot); } // horizontally add 8 floats @@ -524,14 +517,21 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) { uint32_t x32; memcpy(&x32, x, sizeof(uint32_t)); const __m256i shuf_mask = _mm256_set_epi64x( - 0x0303030303030303, 0x0202020202020202, - 0x0101010101010101, 0x0000000000000000); + 0x0303030303030303, 0x0202020202020202, + 0x0101010101010101, 0x0000000000000000); __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); bytes = _mm256_or_si256(bytes, bit_mask); return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); } +static inline __m256i bytes_from_nibbles_32_deinterleave(const uint8_t * rsi) { + const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); + const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp); + const __m256i lowMask = _mm256_set1_epi8( 0xF ); + return _mm256_and_si256(lowMask, bytes); +} + // Unpack 32 4-bit fields into 32 bytes // The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) @@ -984,7 +984,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3); } } -#elif defined(__AVX2__) || defined(__AVX__) +#elif defined(__AVX2__) for (int i = 0; i < nb; i++) { // Load elements into 4 AVX vectors __m256 v0 = _mm256_loadu_ps( x ); @@ -1029,7 +1029,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int __m256i i2 = _mm256_cvtps_epi32( v2 ); __m256i i3 = _mm256_cvtps_epi32( v3 ); -#if defined(__AVX2__) +#if defined(__AVX2__) // || defined(__AVX__) TODO // Convert int32 to int16 i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 @@ -1037,10 +1037,11 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); + // TODO: find a smarter way to do this + i2 = _mm256_permute2f128_si256(i0, i0, 0x01); + i1 = _mm256_shuffle_epi8(i0, _mm256_setr_epi8( 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14,-1,-1,-1,-1, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15)); + i2 = _mm256_shuffle_epi8(i2, _mm256_setr_epi8(-1,-1, 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15,-1,-1)); + i0 = _mm256_or_si256(i1, i2); _mm256_storeu_si256((__m256i *)y[i].qs, i0); #else @@ -1152,7 +1153,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int y[i].s = d * vaddvq_s32(accv); } -#elif defined(__AVX2__) || defined(__AVX__) +#elif defined(__AVX2__) for (int i = 0; i < nb; i++) { // Load elements into 4 AVX vectors __m256 v0 = _mm256_loadu_ps( x ); @@ -1197,7 +1198,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int __m256i i2 = _mm256_cvtps_epi32( v2 ); __m256i i3 = _mm256_cvtps_epi32( v3 ); -#if defined(__AVX2__) +#if defined(__AVX2__) // || defined(__AVX__) TODO // Compute the sum of the quants and set y[i].s y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); @@ -1208,10 +1209,11 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); + // TODO: find a smarter way to do this + i2 = _mm256_permute2f128_si256(i0, i0, 0x01); + i1 = _mm256_shuffle_epi8(i0, _mm256_setr_epi8( 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14,-1,-1,-1,-1, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15)); + i2 = _mm256_shuffle_epi8(i2, _mm256_setr_epi8(-1,-1, 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15,-1,-1)); + i0 = 
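/* What the shuffle pair above does (illustrative): _mm256_packs_epi32 and
   _mm256_packs_epi16 narrow each 128-bit lane independently, leaving the
   32 bytes lane-scrambled; the permute2f128 copy plus the two byte
   shuffles undo that scramble and emit the bytes in the evens-then-odds
   block order in one pass, replacing the _mm256_permutevar8x32_epi32
   fix-up that only restored sequential order. */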
_mm256_or_si256(i1, i2); _mm256_storeu_si256((__m256i *)y[i].qs, i0); #else @@ -2101,7 +2103,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * /* Compute combined scale for the block */ const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); - __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs); // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. const __m256i off = _mm256_set1_epi8( 8 ); @@ -2125,31 +2127,24 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * // Compute combined scale for the block const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); - __m128i i32[2]; - for (int j = 0; j < 2; ++j) { - // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes - __m128i bx = bytes_from_nibbles_16(x[i].qs + 8*j); - __m128i by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16*j)); + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. - const __m128i off = _mm_set1_epi8( 8 ); - bx = _mm_sub_epi8( bx, off ); + const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); - // Get absolute values of x vectors - const __m128i ax = _mm_sign_epi8(bx, bx); + __m128i bx = _mm_and_si128(lowMask, tmp); + __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx, by); - // Sign the values of the y vectors - const __m128i sy = _mm_sign_epi8(by, bx); - - // Perform multiplication and create 16-bit values - const __m128i dot = _mm_maddubs_epi16(ax, sy); - - const __m128i ones = _mm_set1_epi16(1); - i32[j] = _mm_madd_epi16(ones, dot); - } + bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); + by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx, by); // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] )); + __m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1)); + // Apply the scale, and accumulate acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); } @@ -2267,7 +2262,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes - const __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs); const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); const __m256 xy = mul_sum_i8_pairs_float(bx, by); @@ -2471,7 +2466,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * /* Compute combined scale for the block */ const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d)); - __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); bx = _mm256_or_si256(bx, bxhi); @@ -2689,6 +2684,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * #elif defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); + float summs = 0.0f; // Main loop @@ 
-2697,7 +2693,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; - __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); bx = _mm256_or_si256(bx, bxhi); From e038e01e28fda0dd35f9feb230705072447d6cc6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 21:33:29 +0300 Subject: [PATCH 26/32] sha : update hashes for 7B and 13B --- SHA256SUMS | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/SHA256SUMS b/SHA256SUMS index e487bdca6c9c2..c3f935a85b135 100644 --- a/SHA256SUMS +++ b/SHA256SUMS @@ -1,24 +1,27 @@ 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth 666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin -99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6 models/7B/ggml-model-q4_0.bin -cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe models/7B/ggml-model-q4_1.bin -25b050337a87344da687a7f2adddc03bd99b7f6c140450e836649f3585fb6496 models/7B/ggml-model-q4_2.bin +b734d7201dc7869855fe2861247178719607d96372f0fb1bf6a1c5810898a48f models/7B/ggml-model-q4_0.bin +1ea1d3e94d0012ee5c23ee5ee2c8909eb124a1e8e43c11108feb17879d8b9379 models/7B/ggml-model-q4_1.bin +3232f282b40e3330093acb96e7d4983ce15b80a7e38b49d035e83b9aab753671 models/7B/ggml-model-q5_0.bin +75b1e0ef9a7ba27d760e4239422e29a6ced0ff9c4f2537f1cc4754821bdb8d3e models/7B/ggml-model-q5_1.bin 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth 2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin -eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab models/13B/ggml-model-q4_0.bin -d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb models/13B/ggml-model-q4_1.bin -75a218a47df03f5f96354656329864613abcb67779412b9bc2282b28c1c3cbaa models/13B/ggml-model-q4_2.bin +a8dd1a853a3227abda5b2046dcc23b1f06ee8b837bc97b34f6b182229eca21ff models/13B/ggml-model-q4_0.bin +3a58a576f0e188ad77bc5104407f1c7cf129928d1af2f920099fa206ca6af34a models/13B/ggml-model-q4_1.bin +814f9e369ca0daf4517b6a66bdf8d616c5d4ae8b4353fe091d15080e66965c34 models/13B/ggml-model-q5_0.bin +74ab4eacb6ef14e08c7f06a2dd0b2630c3f920149324acf6651222ed397c430f models/13B/ggml-model-q5_1.bin 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth 7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin -517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d models/30B/ggml-model-q4_0.bin -7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd models/30B/ggml-model-q4_1.bin -aadbc9cf806313a55be570f62884eed289d30c313fac3b7838717e01bd553204 
models/30B/ggml-model-q4_2.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_0.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_1.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_0.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_1.bin 2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json 135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth 9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth @@ -29,8 +32,9 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/con 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth 60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin -01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2 models/65B/ggml-model-q4_0.bin -4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f models/65B/ggml-model-q4_1.bin -1b6f6588d0e2ecfe6c4d849088e48e5e3083466b962daa32e3261363e21fc5e9 models/65B/ggml-model-q4_2.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_0.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_1.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_0.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_1.bin 999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json 9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model From 51c25fd99570db81d0a4c03041f89cf88630918d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 21:38:47 +0300 Subject: [PATCH 27/32] readme : update timings + remove warning banner --- README.md | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 75bd9faf57e5d..396e3be657b54 100644 --- a/README.md +++ b/README.md @@ -7,14 +7,6 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ -## ⚠️ TEMPORARY NOTICE ABOUT UPCOMING BREAKING CHANGE ⚠️ - -**The quantization formats will soon be updated: https://github.com/ggerganov/llama.cpp/pull/1305** - -**All `ggml` model files using the old format will not work with the latest `llama.cpp` code after that change is merged** - ---- - **Hot topics:** - [Roadmap May 2023](https://github.com/ggerganov/llama.cpp/discussions/1220) @@ -342,13 +334,13 @@ Several quantization methods are supported. 
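The bits/weight rows can be sanity-checked from the block layouts: a Q4_0 block packs 32 weights into 4 + 16 = 20 bytes (one `float` scale plus 32 nibbles), i.e. 20*8/32 = 5.0 bits/weight; Q4_1 and Q5_1 both total 24 bytes per block = 6.0; Q5_0 is 2 + 4 + 16 = 22 bytes = 5.5; Q8_0 is 4 + 32 = 36 bytes = 9.0.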
They differ in the resulting model d |------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:| | 7B | perplexity | 5.9066 | 6.1620 | 6.0910 | 5.9862 | 5.9481 | 5.9069 | | 7B | file size | 13.0G | 4.0G | 4.8G | 4.4G | 4.8G | 7.1G | -| 7B | ms/tok @ 4th | 128 | 56 | 61 | 91 | 95 | 75 | -| 7B | ms/tok @ 8th | 128 | 47 | 55 | 53 | 59 | 75 | +| 7B | ms/tok @ 4th | 128 | 50 | 54 | 75 | 83 | 75 | +| 7B | ms/tok @ 8th | 123 | 44 | 52 | 53 | 58 | 72 | | 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 | | 13B | perplexity | 5.2543 | 5.3863 | 5.3607 | 5.2856 | 5.2706 | 5.2548 | | 13B | file size | 25.0G | 7.6G | 9.1G | 8.4G | 9.1G | 14G | -| 13B | ms/tok @ 4th | 239 | 104 | 113 | 176 | 185 | 141 | -| 13B | ms/tok @ 8th | 240 | 85 | 99 | 108 | 117 | 147 | +| 13B | ms/tok @ 4th | 239 | 93 | 101 | 150 | 164 | 141 | +| 13B | ms/tok @ 8th | 240 | 81 | 96 | 96 | 104 | 136 | | 13B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 | ### Perplexity (measuring model quality) From 1c87847b6bf10cf4ecc1d6b4b96f9d8b9449820c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 21:48:56 +0300 Subject: [PATCH 28/32] llama : update v2 PR number to 1405 --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index be9f8fffb8084..b2dbc6c3b0862 100644 --- a/llama.cpp +++ b/llama.cpp @@ -840,7 +840,7 @@ static const char *llama_file_version_name(llama_file_version version) { switch (version) { case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)"; case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)"; - case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1305)"; + case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)"; case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)"; } } From 832c53f4274353ec6f16a88d1c0e830526a229fc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 21:59:25 +0300 Subject: [PATCH 29/32] ggml : fix WASM comments --- ggml.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index f4e626433ee1f..a9c10a295cdea 100644 --- a/ggml.c +++ b/ggml.c @@ -2425,7 +2425,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const v128_t v0l = wasm_v128_and (v0, m4b); const v128_t v0h = wasm_u8x16_shr(v0, 4); - // add high bit and sub 16 + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); const v128_t v0hf = wasm_i8x16_sub(v0h, qhh); @@ -2570,7 +2570,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - // add 5th bit + // add high bit const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0); const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0); const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1); @@ -2622,6 +2622,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * uint32_t qh; uint64_t tmp[4]; + // TODO: check if unrolling this is better for (int i = 0; i < nb; ++i) { const block_q5_1 * restrict x0 = &x[i]; const block_q8_1 * restrict y0 = &y[i]; From ca7f069f39251a0289c99d0d55c373b2e181a381 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 23:33:07 +0300 Subject: [PATCH 30/32] ggml : back to original bit order --- SHA256SUMS | 16 +++---- ggml-cuda.cu | 21 ++++----- ggml-opencl.c | 1 
- ggml.c | 125 +++++++++++++++++++++----------------------------- llama.cpp | 4 +- 5 files changed, 73 insertions(+), 94 deletions(-) diff --git a/SHA256SUMS b/SHA256SUMS index c3f935a85b135..593c8efaa2bb7 100644 --- a/SHA256SUMS +++ b/SHA256SUMS @@ -1,17 +1,17 @@ 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth 666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin -b734d7201dc7869855fe2861247178719607d96372f0fb1bf6a1c5810898a48f models/7B/ggml-model-q4_0.bin -1ea1d3e94d0012ee5c23ee5ee2c8909eb124a1e8e43c11108feb17879d8b9379 models/7B/ggml-model-q4_1.bin -3232f282b40e3330093acb96e7d4983ce15b80a7e38b49d035e83b9aab753671 models/7B/ggml-model-q5_0.bin -75b1e0ef9a7ba27d760e4239422e29a6ced0ff9c4f2537f1cc4754821bdb8d3e models/7B/ggml-model-q5_1.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_0.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_1.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_0.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_1.bin 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth 2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin -a8dd1a853a3227abda5b2046dcc23b1f06ee8b837bc97b34f6b182229eca21ff models/13B/ggml-model-q4_0.bin -3a58a576f0e188ad77bc5104407f1c7cf129928d1af2f920099fa206ca6af34a models/13B/ggml-model-q4_1.bin -814f9e369ca0daf4517b6a66bdf8d616c5d4ae8b4353fe091d15080e66965c34 models/13B/ggml-model-q5_0.bin -74ab4eacb6ef14e08c7f06a2dd0b2630c3f920149324acf6651222ed397c430f models/13B/ggml-model-q5_1.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_0.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_1.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_0.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_1.bin 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 08d1566bdd880..8a3beb0e54b88 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -86,8 +86,8 @@ static __global__ void dequantize_block_q4_0(const void * vx, float * y) { const int x0 = (x[i].qs[j] & 0xf) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; - y[i*qk + 2*j + 0] = x0*d; - y[i*qk + 2*j + 1] = x1*d; + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; } } @@ -105,8 +105,8 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) { const int x0 = (x[i].qs[j] & 0xf); const int x1 = (x[i].qs[j] >> 4); - y[i*qk + 2*j + 0] = x0*d + m; - y[i*qk + 2*j + 1] = x1*d + m; + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; } } @@ -129,8 +129,8 @@ static __global__ void dequantize_block_q5_0(const void * vx, float * y) { const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; 
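/* High-bit extraction, spelled out (illustrative): bit j of qh carries the
   5th bit of the low-nibble quant and bit j+16 that of the high-nibble
   quant, i.e. the masked forms above compute

       xh_0 = ((qh >> (j + 0 )) & 1) << 4;
       xh_1 = ((qh >> (j + 16)) & 1) << 4;   // shifting by j+12 already
                                             // lands the bit at position 4
*/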
- y[i*qk + 2*j + 0] = x0*d; - y[i*qk + 2*j + 1] = x1*d; + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; } } @@ -154,8 +154,8 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) { const int x0 = (x[i].qs[j] & 0xf) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; - y[i*qk + 2*j + 0] = x0*d + m; - y[i*qk + 2*j + 1] = x1*d + m; + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; } } @@ -168,9 +168,8 @@ static __global__ void dequantize_block_q8_0(const void * vx, float * y) { const float d = x[i].d; - for (int j = 0; j < qk/2; ++j) { - y[i*qk + 2*j + 0] = x[i].qs[j + 0 ]*d; - y[i*qk + 2*j + 1] = x[i].qs[j + qk/2]*d; + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = x[i].qs[j]*d; } } diff --git a/ggml-opencl.c b/ggml-opencl.c index 230c84f2fb411..0e6e6770f6307 100644 --- a/ggml-opencl.c +++ b/ggml-opencl.c @@ -114,7 +114,6 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f const uint i = get_global_id(0) / 32; const uint l = get_local_id(0); - // TODO: this is broken result[i*32 + l] = blocks[i].qs[l] * blocks[i].d; } diff --git a/ggml.c b/ggml.c index a9c10a295cdea..096ccacfb7e08 100644 --- a/ggml.c +++ b/ggml.c @@ -525,30 +525,14 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) { return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); } -static inline __m256i bytes_from_nibbles_32_deinterleave(const uint8_t * rsi) { - const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); - const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp); - const __m256i lowMask = _mm256_set1_epi8( 0xF ); - return _mm256_and_si256(lowMask, bytes); -} - // Unpack 32 4-bit fields into 32 bytes // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { - // Load 16 bytes from memory - __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi ); - - // Expand bytes into uint16_t values - __m256i bytes = _mm256_cvtepu8_epi16( tmp ); - - // Unpack values into individual bytes + const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); + const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp); const __m256i lowMask = _mm256_set1_epi8( 0xF ); - __m256i high = _mm256_andnot_si256( lowMask, bytes ); - __m256i low = _mm256_and_si256( lowMask, bytes ); - high = _mm256_slli_epi16( high, 4 ); - bytes = _mm256_or_si256( low, high ); - return bytes; + return _mm256_and_si256(lowMask, bytes); } // add int16_t pairwise and return as float vector @@ -766,8 +750,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r y[i].d = d; for (int j = 0; j < qk/2; ++j) { - const float x0 = x[i*qk + 2*j + 0]*id; - const float x1 = x[i*qk + 2*j + 1]*id; + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); @@ -807,8 +791,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r y[i].m = min; for (int j = 0; j < qk/2; ++j) { - const float x0 = (x[i*qk + 2*j + 0] - min)*id; - const float x1 = (x[i*qk + 2*j + 1] - min)*id; + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); @@ -850,8 +834,8 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r uint32_t qh = 0; for (int j = 0; j < qk/2; ++j) { - const float 
x0 = x[i*qk + 2*j + 0]*id; - const float x1 = x[i*qk + 2*j + 1]*id; + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); @@ -898,8 +882,8 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r uint32_t qh = 0; for (int j = 0; j < qk/2; ++j) { - const float x0 = (x[i*qk + 2*j + 0] - min)*id; - const float x1 = (x[i*qk + 2*j + 1] - min)*id; + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; const uint8_t xi0 = (uint8_t)(x0 + 0.5f); const uint8_t xi1 = (uint8_t)(x1 + 0.5f); @@ -937,12 +921,10 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r y[i].d = d; - for (int j = 0; j < QK8_0/2; ++j) { - const float v0 = x[i*QK8_0 + 2*j + 0]*id; - const float v1 = x[i*QK8_0 + 2*j + 1]*id; + for (int j = 0; j < QK8_0; ++j) { + const float x0 = x[i*QK8_0 + j]*id; - y[i].qs[ j] = roundf(v0); - y[i].qs[QK8_0/2 + j] = roundf(v1); + y[i].qs[j] = roundf(x0); } } } @@ -978,13 +960,13 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int const float32x4_t v = vmulq_n_f32(srcv[j], id); const int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[ 2*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1); - y[i].qs[ 2*j + 1] = vgetq_lane_s32(vi, 2); - y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3); + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); } } -#elif defined(__AVX2__) +#elif defined(__AVX2__) || defined(__AVX__) for (int i = 0; i < nb; i++) { // Load elements into 4 AVX vectors __m256 v0 = _mm256_loadu_ps( x ); @@ -1029,7 +1011,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int __m256i i2 = _mm256_cvtps_epi32( v2 ); __m256i i3 = _mm256_cvtps_epi32( v3 ); -#if defined(__AVX2__) // || defined(__AVX__) TODO +#if defined(__AVX2__) // Convert int32 to int16 i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 @@ -1037,11 +1019,10 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 // We got our precious signed bytes, but the order is now wrong - // TODO: find a smarter way to do this - i2 = _mm256_permute2f128_si256(i0, i0, 0x01); - i1 = _mm256_shuffle_epi8(i0, _mm256_setr_epi8( 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14,-1,-1,-1,-1, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15)); - i2 = _mm256_shuffle_epi8(i2, _mm256_setr_epi8(-1,-1, 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15,-1,-1)); - i0 = _mm256_or_si256(i1, i2); + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); _mm256_storeu_si256((__m256i *)y[i].qs, i0); #else @@ -1097,8 +1078,8 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r int sum = 0; for (int j = 0; j < QK8_1/2; ++j) { - const float v0 = x[i*QK8_1 + 2*j + 0]*id; - const float v1 = x[i*QK8_1 + 2*j + 
1]*id; + const float v0 = x[i*QK8_1 + j]*id; + const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id; y[i].qs[ j] = roundf(v0); y[i].qs[QK8_1/2 + j] = roundf(v1); @@ -1143,17 +1124,17 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int const float32x4_t v = vmulq_n_f32(srcv[j], id); const int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[ 2*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1); - y[i].qs[ 2*j + 1] = vgetq_lane_s32(vi, 2); - y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3); + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); accv = vaddq_s32(accv, vi); } y[i].s = d * vaddvq_s32(accv); } -#elif defined(__AVX2__) +#elif defined(__AVX2__) || defined(__AVX__) for (int i = 0; i < nb; i++) { // Load elements into 4 AVX vectors __m256 v0 = _mm256_loadu_ps( x ); @@ -1198,7 +1179,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int __m256i i2 = _mm256_cvtps_epi32( v2 ); __m256i i3 = _mm256_cvtps_epi32( v3 ); -#if defined(__AVX2__) // || defined(__AVX__) TODO +#if defined(__AVX2__) // Compute the sum of the quants and set y[i].s y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); @@ -1209,11 +1190,10 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 // We got our precious signed bytes, but the order is now wrong - // TODO: find a smarter way to do this - i2 = _mm256_permute2f128_si256(i0, i0, 0x01); - i1 = _mm256_shuffle_epi8(i0, _mm256_setr_epi8( 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14,-1,-1,-1,-1, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15)); - i2 = _mm256_shuffle_epi8(i2, _mm256_setr_epi8(-1,-1, 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15,-1,-1)); - i0 = _mm256_or_si256(i1, i2); + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); _mm256_storeu_si256((__m256i *)y[i].qs, i0); #else @@ -1266,8 +1246,8 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict const int x0 = (x[i].qs[j] & 0x0F) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; - y[i*qk + 2*j + 0] = x0*d; - y[i*qk + 2*j + 1] = x1*d; + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; } } } @@ -1287,8 +1267,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict const int x0 = (x[i].qs[j] & 0x0F); const int x1 = (x[i].qs[j] >> 4); - y[i*qk + 2*j + 0] = x0*d + m; - y[i*qk + 2*j + 1] = x1*d + m; + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; } } } @@ -1313,8 +1293,8 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - y[i*qk + 2*j + 0] = x0*d; - y[i*qk + 2*j + 1] = x1*d; + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; } } } @@ -1340,8 +1320,8 @@ static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict const int x0 = (x[i].qs[j] & 0x0F) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; - y[i*qk + 2*j + 0] = x0*d + m; - y[i*qk + 2*j + 1] = x1*d + m; + 
y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; } } } @@ -1358,9 +1338,8 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in for (int i = 0; i < nb; i++) { const float d = x[i].d; - for (int j = 0; j < qk/2; ++j) { - y[i*qk + 2*j + 0] = x[i].qs[j + 0 ]*d; - y[i*qk + 2*j + 1] = x[i].qs[j + qk/2]*d; + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = x[i].qs[j]*d; } } } @@ -2103,7 +2082,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * /* Compute combined scale for the block */ const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); - __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs); + __m256i bx = bytes_from_nibbles_32(x[i].qs); // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. const __m256i off = _mm256_set1_epi8( 8 ); @@ -2262,7 +2241,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes - const __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs); + const __m256i bx = bytes_from_nibbles_32(x[i].qs); const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); const __m256 xy = mul_sum_i8_pairs_float(bx, by); @@ -2466,7 +2445,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * /* Compute combined scale for the block */ const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d)); - __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs); + __m256i bx = bytes_from_nibbles_32(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); bx = _mm256_or_si256(bx, bxhi); @@ -2694,7 +2673,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; - __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs); + __m256i bx = bytes_from_nibbles_32(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); bx = _mm256_or_si256(bx, bxhi); @@ -2719,8 +2698,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * int sumi = 0; for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; diff --git a/llama.cpp b/llama.cpp index b2dbc6c3b0862..b27eb91e4f258 100644 --- a/llama.cpp +++ b/llama.cpp @@ -919,7 +919,9 @@ static void llama_model_load_internal( } if (file_version != LLAMA_FILE_VERSION_GGJT_V2) { - if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) { + if (hparams.ftype != LLAMA_FTYPE_ALL_F32 && + hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 && + hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) { throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)"); } } From b58b1f4bf6d5f7a4086d39fe269fbc06857cbf4f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 12 May 2023 00:00:40 +0300 Subject: [PATCH 31/32] readme : add note that Q4 and Q5 have been changed --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 
396e3be657b54..8bc051c6b91f1 100644
--- a/README.md
+++ b/README.md
@@ -9,8 +9,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 **Hot topics:**
 
+- Quantization formats `Q4` and `Q5` have changed - requantize any old models [(info)](https://github.com/ggerganov/llama.cpp/pull/1405)
 - [Roadmap May 2023](https://github.com/ggerganov/llama.cpp/discussions/1220)
-- [New quantization methods](https://github.com/ggerganov/llama.cpp#quantization)
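The README note above is the user-facing half of the change; on the implementation side, the AVX2 quantization paths in ggml.c now fix up the byte order with a single cross-lane permute instead of the old pair of shuffles plus OR. _mm256_packs_epi32 and _mm256_packs_epi16 each operate on the two 128-bit lanes independently, so after both packs the 32 bytes land in eight 4-byte groups ordered 0-3, 8-11, 16-19, 24-27, 4-7, 12-15, 20-23, 28-31; one _mm256_permutevar8x32_epi32 with the pattern (0, 4, 1, 5, 2, 6, 3, 7) restores ascending order. A scalar model of that permute, offered as an illustration only (not code from the patch):

    #include <stdio.h>

    int main(void) {
        /* After the two in-lane packs, dword g of the vector holds source
           elements 4*src[g] .. 4*src[g]+3. */
        const int src[8]  = {0, 2, 4, 6, 1, 3, 5, 7};
        /* The pattern passed to _mm256_permutevar8x32_epi32 in the patch. */
        const int perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
        for (int i = 0; i < 8; ++i) {
            printf("dst dword %d <- elements %2d..%2d\n",
                   i, 4*src[perm[i]], 4*src[perm[i]] + 3);
        }
        return 0; /* prints elements 0..31 in ascending order */
    }

Because the permute crosses lanes in a single instruction, this also retires the old "find a smarter way to do this" TODO.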
Table of Contents From cbb6a3a7e8e70ced0371ae3b41f4cd2d4dd97333 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 12 May 2023 00:08:36 +0300 Subject: [PATCH 32/32] llama : fix return for unknown version --- llama.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama.cpp b/llama.cpp index b27eb91e4f258..0a47faa9d738d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -843,6 +843,8 @@ static const char *llama_file_version_name(llama_file_version version) { case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)"; case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)"; } + + return "unknown"; } static const char *llama_ftype_name(enum llama_ftype ftype) {
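A final detail worth pinning down from the main patch: the scalar Q5_1 dot product now recovers the fifth bit of each quant by shifting qh before masking with 0x10, rather than isolating bit j (or j + 16) first and then shifting it into place. Both forms reduce to "bit j (respectively j + 16) of qh, placed at bit position 4". A small self-contained check, as an illustration only:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const uint32_t qh = 0xA5C3F10Eu; /* arbitrary bit pattern */
        for (int j = 0; j < 16; ++j) {
            /* old forms */
            const uint8_t old0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
            const uint8_t old1 =  (qh & (1u << (j + 16))) >> (j + 12);
            /* new forms */
            const uint8_t new0 = ((qh >> (j + 0 )) << 4) & 0x10;
            const uint8_t new1 =  (qh >> (j + 12))       & 0x10;
            assert(old0 == new0);
            assert(old1 == new1);
        }
        printf("old and new Q5_1 high-bit extraction agree\n");
        return 0;
    }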