From 5fa47bf6c72c7ccc2c2dd36755c6cbba9c30dbc6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 3 May 2023 23:13:37 +0300 Subject: [PATCH 01/32] ggml : remove Q4_0 bit shufling (ARM NEON) --- ggml.c | 468 +++++---------------------------------------------------- 1 file changed, 39 insertions(+), 429 deletions(-) diff --git a/ggml.c b/ggml.c index 4e309df8a48e2..f957b786042ce 100644 --- a/ggml.c +++ b/ggml.c @@ -837,348 +837,52 @@ static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block s // reference implementation for deterministic creation of model files static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { + assert(QK4_0 / 16 == 0); assert(k % QK4_0 == 0); - const int nb = k / QK4_0; - uint8_t pp[QK4_0/2]; + const int nb = k / QK4_0; for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max - float max = 0.0f; + float max = 0.0f; for (int l = 0; l < QK4_0; l++) { const float v = x[i*QK4_0 + l]; if (amax < fabsf(v)) { amax = fabsf(v); - max = v; + max = v; } } - const float d = max / -8; + const float d = max / -8; const float id = d ? 1.0f/d : 0.0f; y[i].d = d; - for (int l = 0; l < QK4_0; l += 2) { - const float v0 = x[i*QK4_0 + l + 0]*id; - const float v1 = x[i*QK4_0 + l + 1]*id; + uint64_t qs[QK4_0 / 16] = {0}; - const uint8_t vi0 = MIN(15, (int8_t)roundf(v0) + 8); - const uint8_t vi1 = MIN(15, (int8_t)roundf(v1) + 8); + // pack first half of weights into low nibbles and second half into high nibbles + for (int l = 0; l < QK4_0/2; ++l) { + const float v0 = x[i*QK4_0 + 0 + l]*id; + const float v1 = x[i*QK4_0 + QK4_0/2 + l]*id; - assert(vi0 < 16); - assert(vi1 < 16); + const uint64_t vi0 = MIN(15, (int8_t)(v0 + 8.5f)); + const uint64_t vi1 = MIN(15, (int8_t)(v1 + 8.5f)); - pp[l/2] = vi0 | (vi1 << 4); + qs[l/8] |= vi0 << (8*(l & 7)); + qs[l/8] |= vi1 << (8*(l & 7) + 4); } - memcpy(y[i].qs, pp, sizeof(pp)); + memcpy(y[i].qs, qs, sizeof(qs)); } } static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int k) { assert(k % QK4_0 == 0); - const int nb = k / QK4_0; block_q4_0 * restrict y = vy; -#if defined(__POWER9_VECTOR__) - const vector float v85 = vec_splats(8.5f); - const vector signed int v15 = vec_splats(15); - for (int i = 0; i < nb; i++) { - float max = 0.0f; - float min = 0.0f; - - vector float asrcv [8]; - vector float srcv [8]; - vector float maxv[8]; - vector float minv[8]; - - for (int l = 0; l < 8; l++) srcv[l] = *(vector float *)(x + i*32 + 4*l); - //for (int l = 0; l < 8; l++) asrcv[l] = vec_abs(srcv[l]); - - for (int l = 0; l < 4; l++) maxv[2*l] = vec_max(asrcv[2*l], asrcv[2*l+1]); - //for (int l = 0; l < 2; l++) maxv[4*l] = vec_max(maxv[4*l], maxv[4*l+2]); - maxv[0] = vec_max(maxv[0], maxv[2]); - maxv[4] = vec_max(maxv[4], maxv[6]); - //for (int l = 0; l < 1; l++) maxv[8*l] = vec_max(maxv[8*l], maxv[8*l+4]); - maxv[0] = vec_max(maxv[0], maxv[4]); - - for (int l = 0; l < 4; l++) minv[2*l] = vec_min(asrcv[2*l], asrcv[2*l+1]); - //for (int l = 0; l < 2; l++) minv[4*l] = vec_min(minv[4*l], minv[4*l+2]); - minv[0] = vec_min(minv[0], minv[2]); - minv[4] = vec_min(minv[4], minv[6]); - //for (int l = 0; l < 1; l++) minv[8*l] = vec_min(minv[8*l], minv[8*l+4]); - minv[0] = vec_min(minv[0], minv[4]); - - - max = MAX( - MAX(vec_extract(maxv[0], 0), vec_extract(maxv[0], 1)), - MAX(vec_extract(maxv[0], 2), vec_extract(maxv[0], 3))); - min = MIN( - MIN(vec_extract(minv[0], 0), vec_extract(minv[0], 1)), - MIN(vec_extract(minv[0], 2), vec_extract(minv[0], 3))); - - const float magnitude 
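/*
 * Note: "magnitude" keeps the sign of the extreme value on purpose.
 * The scale below is d = magnitude / -8, so the weight with the largest
 * magnitude maps exactly to the end point -8 of the signed 4-bit range
 * [-8, 7], whichever sign that extreme has.
 */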
= max >= fabsf(min) ? max : min; - const float d = magnitude / -8; - const float id = d ? 1.0/d : 0.0; - - y[i].d = d; - - const vector float vid = vec_splats(id); - uint8_t * restrict pb = y[i].qs; - for (int l = 0; l < 8; l++) { - const vector float vf = vec_madd(srcv[l], vid, v85); - const vector signed int vi = vec_signed(vf); - const vector signed int vc = vec_min(vi, v15); - - pb[2*l + 0] = vec_extract(vc, 0) | (vec_extract(vc, 1) << 4); - pb[2*l + 1] = vec_extract(vc, 2) | (vec_extract(vc, 3) << 4); - } - } -#elif __ARM_NEON - for (int i = 0; i < nb; i++) { - float32x4_t srcv [8]; - float32x4_t maxv[8]; - float32x4_t minv[8]; - - for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l); - - for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l+1]); - for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l+2]); - for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l+4]); - - for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l+1]); - for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l+2]); - for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l+4]); - - const float max = vmaxvq_f32(maxv[0]); - const float min = vminvq_f32(minv[0]); - - const float magnitude = max >= fabsf(min) ? max : min; - const float d = magnitude / -8; - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = d; - - for (int l = 0; l < 8; l++) { - const float32x4_t v = vmulq_n_f32(srcv[l], id); - const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f)); - const int32x4_t vi = vcvtq_s32_f32(vf); - const int32x4_t vc = vminq_s32(vi, vdupq_n_s32(15)); - - y[i].qs[2*l + 0] = vgetq_lane_s32(vc, 0) | (vgetq_lane_s32(vc, 1) << 4); - y[i].qs[2*l + 1] = vgetq_lane_s32(vc, 2) | (vgetq_lane_s32(vc, 3) << 4); - } - } -#elif defined(__AVX2__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max for the block - __m256 max = _mm256_max_ps( v0, v1 ); - __m256 maxTmp = _mm256_max_ps( v2, v3 ); - max = _mm256_max_ps( max, maxTmp ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( max, 1 ), _mm256_castps256_ps128( max ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); - - // Compute min for the block - __m256 min = _mm256_min_ps( v0, v1 ); - __m256 minTmp = _mm256_min_ps( v2, v3 ); - min = _mm256_min_ps( min, minTmp ); - - __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( min, 1 ), _mm256_castps256_ps128( min ) ); - min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) ); - min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) ); - const float minScalar = _mm_cvtss_f32( min4 ); - - // Quantize these floats - const float magnitude = maxScalar >= fabsf(minScalar) ? maxScalar : minScalar; - const float d = magnitude / -8.0f; - y[i].d = d; - const float id = ( magnitude != 0.0f ) ? 
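/*
 * id is the reciprocal scale 1/d, written directly as -8/magnitude so
 * that each element is quantized with a single multiply instead of a
 * division per element.
 */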
-8.0f / magnitude : 0.0f; - const __m256 mul = _mm256_set1_ps( id ); - - // Apply the multiplier - v0 = _mm256_mul_ps( v0, mul ); - v1 = _mm256_mul_ps( v1, mul ); - v2 = _mm256_mul_ps( v2, mul ); - v3 = _mm256_mul_ps( v3, mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - - // Convert int32 to int16 - i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 - i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 - // Convert int16 to int8 - i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 - - // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); - - // Apply offset and clamp to translate the range from [ -8 .. +8 ] into [ +0 .. +15 ] - const __m256i off = _mm256_set1_epi8( 8 ); - i0 = _mm256_add_epi8( i0, off ); - const __m256i maxNibble = _mm256_set1_epi8( 15 ); - i0 = _mm256_min_epi8( i0, maxNibble ); - - // Compress the vector into 4 bit/value, and store - __m128i res = packNibbles( i0 ); - _mm_storeu_si128( ( __m128i* )y[i].qs, res ); - } -#elif defined(__AVX__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max for the block - __m256 max = _mm256_max_ps( v0, v1 ); - __m256 maxTmp = _mm256_max_ps( v2, v3 ); - max = _mm256_max_ps( max, maxTmp ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( max, 1 ), _mm256_castps256_ps128( max ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); - - // Compute min for the block - __m256 min = _mm256_min_ps( v0, v1 ); - __m256 minTmp = _mm256_min_ps( v2, v3 ); - min = _mm256_min_ps( min, minTmp ); - - __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( min, 1 ), _mm256_castps256_ps128( min ) ); - min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) ); - min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) ); - const float minScalar = _mm_cvtss_f32( min4 ); - - // Quantize these floats - const float magnitude = maxScalar >= fabsf(minScalar) ? maxScalar : minScalar; - const float d = magnitude / -8.0f; - y[i].d = d; - const float id = ( magnitude != 0.0f ) ? 
-8.0f / magnitude : 0.0f; - const __m256 mul = _mm256_set1_ps( id ); - - // Apply the multiplier - v0 = _mm256_mul_ps( v0, mul ); - v1 = _mm256_mul_ps( v1, mul ); - v2 = _mm256_mul_ps( v2, mul ); - v3 = _mm256_mul_ps( v3, mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - - // Since we don't have in AVX some necessary functions, - // we split the registers in half and call AVX2 analogs from SSE - __m128i ni0 = _mm256_castsi256_si128( i0 ); - __m128i ni1 = _mm256_extractf128_si256( i0, 1); - __m128i ni2 = _mm256_castsi256_si128( i1 ); - __m128i ni3 = _mm256_extractf128_si256( i1, 1); - __m128i ni4 = _mm256_castsi256_si128( i2 ); - __m128i ni5 = _mm256_extractf128_si256( i2, 1); - __m128i ni6 = _mm256_castsi256_si128( i3 ); - __m128i ni7 = _mm256_extractf128_si256( i3, 1); - - // Convert int32 to int16 - ni0 = _mm_packs_epi32( ni0, ni1 ); - ni2 = _mm_packs_epi32( ni2, ni3 ); - ni4 = _mm_packs_epi32( ni4, ni5 ); - ni6 = _mm_packs_epi32( ni6, ni7 ); - // Convert int16 to int8 - ni0 = _mm_packs_epi16( ni0, ni2 ); - ni4 = _mm_packs_epi16( ni4, ni6 ); - - // Apply offset and clamp to translate the range from [ -8 .. +8 ] into [ +0 .. +15 ] - const __m128i off = _mm_set1_epi8( 8 ); - ni0 = _mm_add_epi8( ni0, off ); - ni4 = _mm_add_epi8( ni4, off ); - const __m128i maxNibble = _mm_set1_epi8( 15 ); - ni0 = _mm_min_epi8( ni0, maxNibble ); - ni4 = _mm_min_epi8( ni4, maxNibble ); - - // Compress the vector into 4 bit/value, and store - __m128i res = packNibbles( ni0, ni4 ); - _mm_storeu_si128( ( __m128i* )y[i].qs, res ); - } -#elif defined(__wasm_simd128__) - for (int i = 0; i < nb; i++) { - float max = 0.0f; - float min = 0.0f; - - v128_t srcv [8]; - v128_t maxv[8]; - v128_t minv[8]; - - for (int l = 0; l < 8; l++) srcv[l] = wasm_v128_load(x + i*32 + 4*l); - - for (int l = 0; l < 4; l++) maxv[2*l] = wasm_f32x4_max(srcv[2*l], srcv[2*l+1]); - for (int l = 0; l < 2; l++) maxv[4*l] = wasm_f32x4_max(maxv[4*l], maxv[4*l+2]); - for (int l = 0; l < 1; l++) maxv[8*l] = wasm_f32x4_max(maxv[8*l], maxv[8*l+4]); - - for (int l = 0; l < 4; l++) minv[2*l] = wasm_f32x4_min(srcv[2*l], srcv[2*l+1]); - for (int l = 0; l < 2; l++) minv[4*l] = wasm_f32x4_min(minv[4*l], minv[4*l+2]); - for (int l = 0; l < 1; l++) minv[8*l] = wasm_f32x4_min(minv[8*l], minv[8*l+4]); - - max = MAX( - MAX(wasm_f32x4_extract_lane(maxv[0], 0), wasm_f32x4_extract_lane(maxv[0], 1)), - MAX(wasm_f32x4_extract_lane(maxv[0], 2), wasm_f32x4_extract_lane(maxv[0], 3))); - min = MIN( - MIN(wasm_f32x4_extract_lane(minv[0], 0), wasm_f32x4_extract_lane(minv[0], 1)), - MIN(wasm_f32x4_extract_lane(minv[0], 2), wasm_f32x4_extract_lane(minv[0], 3))); - - const float magnitude = max >= fabsf(min) ? max : min; - const float d = magnitude / -8; - const float id = d ? 
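/*
 * The 8.5f added below folds two steps into one: +8 shifts the signed
 * range [-8, 7] onto the unsigned nibble range [0, 15], and the extra
 * 0.5 turns the following truncation into round-to-nearest. The
 * min(..., 15) clamp catches the single value that can still land on 16.
 */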
1.0/d : 0.0; - - y[i].d = d; - - for (int l = 0; l < 8; l++) { - const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id)); - const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f)); - const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf); - const v128_t vc = wasm_i32x4_min(vi, wasm_i32x4_splat(15)); - - y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4); - y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4); - } - } -#else - // scalar quantize_row_q4_0_reference(x, y, k); -#endif } static void quantize_row_q4_1_reference(const float * restrict x, void * restrict vy, int k) { @@ -1843,121 +1547,33 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int } static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) { + assert(QK4_0 / 16 == 0); assert(k % QK4_0 == 0); + const int nb = k / QK4_0; const block_q4_0 * restrict x = vx; -#if defined(__AVX2__) - for (int i = 0; i < nb; i++) { - // scale factor - const __m256 d_v = _mm256_broadcast_ss(&x[i].d); - - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_0; l += 32) { - // Load 32x4-bit integers into 32x8-bit integers - __m256i vx8 = bytes_from_nibbles_32(pp+l/2); - - // Subtract 8 from the integers - vx8 = _mm256_sub_epi8(vx8, _mm256_set1_epi8(8)); - - // Convert to 16-bit int - const __m256i vx16_lo = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 0)); - const __m256i vx16_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 1)); - - // Convert to 32-bit int -> float 32 - const __m256 vf[4] = { - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 0))), - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 1))), - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 0))), - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 1))) - }; - - // Scale and store - for (int j = 0; j < 4; j++) { - const __m256 result = _mm256_mul_ps(vf[j], d_v); - _mm256_storeu_ps(y + i * QK4_0 + l + j*8, result); - } - } - } -#elif defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - const float32x4_t vd = vdupq_n_f32(x[i].d); - - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_0; l += 16) { - // Load 16x4-bit integers into 8x8-bit integers - const uint8x8_t v8 = vld1_u8(pp + l/2); - - // Expand 4-bit qs to 8-bit bytes - const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0F)); - const uint8x8_t v1 = vshr_n_u8(v8, 4); - - // Convert to signed 8-bit integers - const int8x8_t vs_0 = vreinterpret_s8_u8(v0); - const int8x8_t vs_1 = vreinterpret_s8_u8(v1); - - // Subtract 8 from each byte - const int8x8_t vb_0 = vsub_s8(vs_0, vdup_n_s8(8)); - const int8x8_t vb_1 = vsub_s8(vs_1, vdup_n_s8(8)); - - // Interleave and combine - const int8x8_t vx_0 = vzip1_s8(vb_0, vb_1); - const int8x8_t vx_1 = vzip2_s8(vb_0, vb_1); - - const int8x16_t vq = vcombine_s8(vx_0, vx_1); - - // convert to 2x int16x8_t - const int16x8_t vi_0 = vmovl_s8(vget_low_s8 (vq)); - const int16x8_t vi_1 = vmovl_s8(vget_high_s8(vq)); - - // convert to 4x float32x4_t - const float32x4_t vf_0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_0))); - const float32x4_t vf_1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_0))); - const float32x4_t vf_2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_1))); - const float32x4_t vf_3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_1))); - - // Multiply by d - const float32x4_t r0 = vmulq_f32(vf_0, vd); - const float32x4_t r1 = 
vmulq_f32(vf_1, vd); - const float32x4_t r2 = vmulq_f32(vf_2, vd); - const float32x4_t r3 = vmulq_f32(vf_3, vd); - - // Store - vst1q_f32(y + i*QK4_0 + l + 0, r0); - vst1q_f32(y + i*QK4_0 + l + 4, r1); - vst1q_f32(y + i*QK4_0 + l + 8, r2); - vst1q_f32(y + i*QK4_0 + l + 12, r3); - } - } -#else - // scalar for (int i = 0; i < nb; i++) { const float d = x[i].d; - const uint8_t * restrict pp = x[i].qs; + // unpack nibbles into bytes + uint64_t qs[QK4_0 / 8] = {0}; - for (int l = 0; l < QK4_0; l += 2) { - const uint8_t vi = pp[l/2]; + memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); + memcpy(qs + QK4_0 / 16, x[i].qs, sizeof(x[i].qs)); - const int8_t vi0 = vi & 0x0F; - const int8_t vi1 = vi >> 4; - - const float v0 = (vi0 - 8)*d; - const float v1 = (vi1 - 8)*d; - - //printf("d = %f, vi = %d, vi0 = %d, vi1 = %d, v0 = %f, v1 = %f\n", d, vi, vi0, vi1, v0, v1); + for (int l = 0; l < QK4_0 / 16; ++l) { + qs[l ] = (qs[l ] & 0x0F0F0F0F0F0F0F0FULL) >> 0; + qs[l + QK4_0/16] = (qs[l + QK4_0/16] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; + } - y[i*QK4_0 + l + 0] = v0; - y[i*QK4_0 + l + 1] = v1; + const uint8_t * restrict qsp = (const uint8_t * restrict) qs; - assert(!isnan(y[i*QK4_0 + l + 0])); - assert(!isnan(y[i*QK4_0 + l + 1])); + for (int l = 0; l < QK4_0; ++l) { + y[i*QK4_0 + l] = (qsp[l] - 8)*d; } } -#endif } static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, int k) { @@ -2887,12 +2503,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); - // interleave - const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs); - const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs); - const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs); - const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs); - // load y const int8x16_t v1_0l = vld1q_s8(y0->qs); const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); @@ -2901,21 +2511,21 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * #if defined(__ARM_FEATURE_DOTPROD) // dot product into int32x4_t - const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h); - const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h); + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d); #else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h)); + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = 
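/*
 * With the non-shuffled layout, the masked low nibbles (v0_*ls) are the
 * first 16 weights of a block and the shifted high nibbles (v0_*hs) are
 * the second 16, so they already line up with the two 16-byte halves of
 * the q8_0 operand. The vzip1q_s8/vzip2q_s8 interleave that the old
 * bit-shuffled format needed is gone from this kernel.
 */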
vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h)); const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); From 844d2af89dcfe87129428eacd3d03feba4de8ee6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 May 2023 20:53:14 +0300 Subject: [PATCH 02/32] ggml : remove Q4_1 bit shuffling (ARM NEON + reference) --- ggml.c | 416 +++++++++++++++------------------------------------------ 1 file changed, 111 insertions(+), 305 deletions(-) diff --git a/ggml.c b/ggml.c index f957b786042ce..cb8cea045567b 100644 --- a/ggml.c +++ b/ggml.c @@ -837,17 +837,19 @@ static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block s // reference implementation for deterministic creation of model files static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { - assert(QK4_0 / 16 == 0); - assert(k % QK4_0 == 0); + static const int qk = QK4_0; - const int nb = k / QK4_0; + assert(qk / 16 == 0); + assert(k % qk == 0); + + const int nb = k / qk; for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max float max = 0.0f; - for (int l = 0; l < QK4_0; l++) { - const float v = x[i*QK4_0 + l]; + for (int l = 0; l < qk; l++) { + const float v = x[i*qk + l]; if (amax < fabsf(v)) { amax = fabsf(v); max = v; @@ -862,9 +864,9 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r uint64_t qs[QK4_0 / 16] = {0}; // pack first half of weights into low nibbles and second half into high nibbles - for (int l = 0; l < QK4_0/2; ++l) { - const float v0 = x[i*QK4_0 + 0 + l]*id; - const float v1 = x[i*QK4_0 + QK4_0/2 + l]*id; + for (int l = 0; l < qk/2; ++l) { + const float v0 = x[i*qk + 0 + l]*id; + const float v1 = x[i*qk + qk/2 + l]*id; const uint64_t vi0 = MIN(15, (int8_t)(v0 + 8.5f)); const uint64_t vi1 = MIN(15, (int8_t)(v1 + 8.5f)); @@ -877,176 +879,55 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r } } -static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int k) { - assert(k % QK4_0 == 0); - - block_q4_0 * restrict y = vy; - +static void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { quantize_row_q4_0_reference(x, y, k); } -static void quantize_row_q4_1_reference(const float * restrict x, void * restrict vy, int k) { - assert(k % QK4_1 == 0); - const int nb = k / QK4_1; +static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { + const int qk = QK4_1; - block_q4_1 * restrict y = vy; + assert(qk / 16 == 0); + assert(k % qk == 0); - uint8_t pp[QK4_1/2]; + const int nb = k / qk; for (int i = 0; i < nb; i++) { float min = FLT_MAX; float max = -FLT_MAX; - for (int l = 0; l < QK4_1; l++) { - const float v = x[i*QK4_1 + l]; + for (int l = 0; l < qk; l++) { + const float v = x[i*qk + l]; + if (v < min) min = v; if (v > max) max = v; } - const float d = (max - min) / ((1 << 4) - 1); + const float d = (max - min) / ((1 << 4) - 1); const float id = d ? 
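/*
 * q4_1 is affine rather than symmetric: d spreads [min, max] across the
 * 16 nibble values and min is kept as the per-block offset m, so a
 * weight is reconstructed as q*d + m. Rounding needs only +0.5f here
 * because subtracting min has already shifted the values into [0, 15].
 */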
1.0f/d : 0.0f; y[i].d = d; y[i].m = min; - for (int l = 0; l < QK4_1; l += 2) { - const float v0 = (x[i*QK4_1 + l + 0] - min)*id; - const float v1 = (x[i*QK4_1 + l + 1] - min)*id; + uint64_t qs[QK4_1 / 16] = {0}; - const uint8_t vi0 = roundf(v0); - const uint8_t vi1 = roundf(v1); + // pack first half of weights into low nibbles and second half into high nibbles + for (int l = 0; l < qk/2; ++l) { + const float v0 = (x[i*qk + 0 + l] - min)*id; + const float v1 = (x[i*qk + qk/2 + l] - min)*id; - assert(vi0 < 16); - assert(vi1 < 16); + const uint64_t vi0 = MIN(15, (int8_t)(v0 + 0.5f)); + const uint64_t vi1 = MIN(15, (int8_t)(v1 + 0.5f)); - pp[l/2] = vi0 | (vi1 << 4); + qs[l/8] |= vi0 << (8*(l & 7)); + qs[l/8] |= vi1 << (8*(l & 7) + 4); } - memcpy(y[i].qs, pp, sizeof(pp)); + memcpy(y[i].qs, qs, sizeof(qs)); } } -static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int k) { - assert(k % QK4_1 == 0); - - const int nb = k / QK4_1; - - block_q4_1 * restrict y = vy; - -#if defined(__AVX2__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max for the block - __m256 vmax; - vmax = _mm256_max_ps( v0, v1 ); - vmax = _mm256_max_ps( vmax, v2 ); - vmax = _mm256_max_ps( vmax, v3 ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( vmax, 1 ), _mm256_castps256_ps128( vmax ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); - - // Compute min for the block - __m256 vmin; - vmin = _mm256_min_ps( v0, v1 ); - vmin = _mm256_min_ps( vmin, v2 ); - vmin = _mm256_min_ps( vmin, v3 ); - - __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( vmin, 1 ), _mm256_castps256_ps128( vmin ) ); - min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) ); - min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) ); - const float minScalar = _mm_cvtss_f32( min4 ); - - // Quantize these floats - const float d = (maxScalar - minScalar) / ((1 << 4) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - y[i].m = minScalar; - y[i].d = d; - - // x = (x-min)*id - const __m256 mul = _mm256_set1_ps( id ); - const __m256 off = _mm256_set1_ps( minScalar ); - v0 = _mm256_mul_ps( _mm256_sub_ps( v0, off ), mul ); - v1 = _mm256_mul_ps( _mm256_sub_ps( v1, off ), mul ); - v2 = _mm256_mul_ps( _mm256_sub_ps( v2, off ), mul ); - v3 = _mm256_mul_ps( _mm256_sub_ps( v3, off ), mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - - // Convert int32 to int16 - i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 - i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 - // Convert int16 to int8 - i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 - - // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); - - // Compress the vector into 4 bit/value, and store - __m128i res = packNibbles( i0 ); - _mm_storeu_si128( ( __m128i* )y[i].qs, res ); - } -#elif __ARM_NEON - for (int i = 0; i < nb; i++) { - float32x4_t srcv[8]; - float32x4_t minv[8]; - float32x4_t maxv[8]; - - for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*QK4_1 + 4*l); - - for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]); - for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]); - for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l + 4]); - - for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l + 1]); - for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l + 2]); - for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l + 4]); - - const float min = vminvq_f32(minv[0]); - const float max = vmaxvq_f32(maxv[0]); - - const float d = (max - min) / ((1 << 4) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - y[i].d = d; - y[i].m = min; - - const float32x4_t minv0 = vdupq_n_f32(min); - - for (int l = 0; l < 8; l++) { - const float32x4_t v = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id); - const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(0.5f)); // needed to round to nearest - const int32x4_t vi = vcvtq_s32_f32(vf); - - y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4); - y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4); - } - } -#else - // scalar - quantize_row_q4_1_reference(x, vy, k); -#endif +static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { + quantize_row_q4_1_reference(x, y, k); } // reference implementation for deterministic creation of model files @@ -1546,13 +1427,13 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int #endif } -static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) { - assert(QK4_0 / 16 == 0); - assert(k % QK4_0 == 0); +static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { + static const int qk = QK4_0; - const int nb = k / QK4_0; + assert(qk / 16 == 0); + assert(k % qk == 0); - const block_q4_0 * restrict x = vx; + const int nb = k / qk; for (int i = 0; i < nb; i++) { const float d = x[i].d; @@ -1560,126 +1441,49 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in // unpack nibbles into bytes uint64_t qs[QK4_0 / 8] = {0}; - memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); - memcpy(qs + QK4_0 / 16, x[i].qs, sizeof(x[i].qs)); + memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); - for (int l = 0; l < QK4_0 / 16; ++l) { - qs[l ] = (qs[l ] & 0x0F0F0F0F0F0F0F0FULL) >> 0; - qs[l + QK4_0/16] = (qs[l + QK4_0/16] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; + for (int l = 0; l < qk / 16; ++l) { + qs[l + qk/16] = (qs[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; + qs[l + 0 ] = (qs[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; } const uint8_t * restrict qsp = (const uint8_t * restrict) qs; - for (int l = 0; l < QK4_0; ++l) { - y[i*QK4_0 + l] = (qsp[l] - 8)*d; + for (int l = 0; l < qk; ++l) { + y[i*qk + l] = (qsp[l] - 8)*d; } } } -static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, int k) { - assert(k % QK4_1 == 0); - const int nb = k / QK4_1; - - const block_q4_1 * restrict x = vx; - -#if defined(__AVX2__) - for (int i = 0; i < nb; i++) { - const __m256 d_v = _mm256_broadcast_ss(&x[i].d); - const __m256 d_m = _mm256_broadcast_ss(&x[i].m); - - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_1; l += 32) { - // Load 32x4-bit integers into 32x8-bit integers - __m256i vx8 = bytes_from_nibbles_32(pp+l/2); +static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { + static const int qk = QK4_1; - // Convert to 16-bit int - const __m256i vx16_lo = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 0)); - const __m256i vx16_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 1)); + assert(qk / 16 == 0); + assert(k % qk == 0); - // Convert to 32-bit int -> float 32 - const __m256 vf[4] = { - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 0))), - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 1))), - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 0))), - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 1))) - }; - - // Scale, add m and store - for (int j = 0; j < 4; j++) { - const __m256 result = _mm256_add_ps(_mm256_mul_ps(vf[j], 
d_v), d_m); - _mm256_storeu_ps(y + i * QK4_1 + l + j*8, result); - } - } - } -#elif defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - const float32x4_t vd = vdupq_n_f32(x[i].d); - const float32x4_t vm = vdupq_n_f32(x[i].m); - - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_1; l += 16) { - // Load 16x4-bit integers into 8x8-bit integers - const uint8x8_t v8 = vld1_u8(pp + l/2); - - // Expand 4-bit qs to 8-bit bytes - const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0F)); - const uint8x8_t v1 = vshr_n_u8(v8, 4); + const int nb = k / qk; - // Interleave and combine - const uint8x8_t vx_0 = vzip1_u8(v0, v1); - const uint8x8_t vx_1 = vzip2_u8(v0, v1); - - const uint8x16_t vq = vcombine_u8(vx_0, vx_1); - - // convert to 2x uint16x8_t - const uint16x8_t vi_0 = vmovl_u8(vget_low_u8 (vq)); - const uint16x8_t vi_1 = vmovl_u8(vget_high_u8(vq)); - - // convert to 4x float32x4_t - const float32x4_t vf_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_0))); - const float32x4_t vf_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_0))); - const float32x4_t vf_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_1))); - const float32x4_t vf_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_1))); - - // multiply by d and add m - const float32x4_t r0 = vmlaq_f32(vm, vf_0, vd); - const float32x4_t r1 = vmlaq_f32(vm, vf_1, vd); - const float32x4_t r2 = vmlaq_f32(vm, vf_2, vd); - const float32x4_t r3 = vmlaq_f32(vm, vf_3, vd); - - // Store - vst1q_f32(y + i*QK4_1 + l + 0, r0); - vst1q_f32(y + i*QK4_1 + l + 4, r1); - vst1q_f32(y + i*QK4_1 + l + 8, r2); - vst1q_f32(y + i*QK4_1 + l + 12, r3); - } - } -#else for (int i = 0; i < nb; i++) { const float d = x[i].d; const float m = x[i].m; - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_1; l += 2) { - const uint8_t vi = pp[l/2]; + // unpack nibbles into bytes + uint64_t qs[QK4_0 / 8] = {0}; - const int8_t vi0 = vi & 0x0F; - const int8_t vi1 = vi >> 4; + memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); - const float v0 = vi0*d + m; - const float v1 = vi1*d + m; + for (int l = 0; l < qk / 16; ++l) { + qs[l + qk/16] = (qs[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; + qs[l + 0 ] = (qs[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; + } - y[i*QK4_1 + l + 0] = v0; - y[i*QK4_1 + l + 1] = v1; + const uint8_t * restrict qsp = (const uint8_t * restrict) qs; - assert(!isnan(y[i*QK4_1 + l + 0])); - assert(!isnan(y[i*QK4_1 + l + 1])); + for (int l = 0; l < qk; ++l) { + y[i*qk + l] = qsp[l]*d + m; } } -#endif } static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, int k) { @@ -1810,7 +1614,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = { - .dequantize_row_q = dequantize_row_q4_0, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0, .quantize_row_q = quantize_row_q4_0, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, .quantize_row_q_dot = quantize_row_q8_0, @@ -1818,7 +1622,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q4_1] = { - .dequantize_row_q = dequantize_row_q4_1, + .dequantize_row_q = (dequantize_row_q_t)dequantize_row_q4_1, .quantize_row_q = quantize_row_q4_1, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, .quantize_row_q_dot = quantize_row_q8_1, @@ -2467,9 +2271,10 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t } static void ggml_vec_dot_q4_0_q8_0(const int n, float * 
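/*
 * Per 32-element block, this kernel computes
 *
 *     s += x->d * y->d * sum_j (q_x[j] - 8) * q_y[j]
 *
 * the -8 offset of q4_0 is applied to the nibbles before the integer
 * dot product, and both block scales are factored out once per block.
 */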
restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK8_0; + const int qk = QK8_0; + const int nb = n / qk; - assert(n % QK8_0 == 0); + assert(n % qk == 0); assert(nb % 2 == 0); const block_q4_0 * restrict x = vx; @@ -2604,41 +2409,45 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * #else // scalar float sumf = 0.0; + for (int i = 0; i < nb; i++) { - const float d0 = x[i].d; - const float d1 = y[i].d; + // unpack nibbles into bytes + uint64_t qs[QK8_0 / 8] = {0}; - const uint8_t * restrict p0 = x[i].qs; - const int8_t * restrict p1 = y[i].qs; + memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); - int sumi = 0; - for (int j = 0; j < QK8_0/2; j++) { - const uint8_t v0 = p0[j]; + for (int l = 0; l < qk / 16; ++l) { + qs[l + qk/16] = (qs[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; + qs[l + 0 ] = (qs[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; + } - const int i0 = (int8_t) (v0 & 0x0F) - 8; - const int i1 = (int8_t) (v0 >> 4) - 8; + const uint8_t * restrict px = (const uint8_t * restrict) qs; + const int8_t * restrict py = y[i].qs; - const int i2 = p1[2*j + 0]; - const int i3 = p1[2*j + 1]; + int sumi = 0; - sumi += i0*i2 + i1*i3; + for (int j = 0; j < qk; ++j) { + sumi += (px[j] - 8) * py[j]; } - sumf += d0*d1*sumi; + + sumf += (x[i].d*y[i].d)*sumi; } + *s = sumf; #endif } static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK8_1; + const int qk = QK8_1; + const int nb = n / qk; - assert(n % QK8_1 == 0); + assert(n % qk == 0); assert(nb % 2 == 0); const block_q4_1 * restrict x = vx; const block_q8_1 * restrict y = vy; - // TODO: add AVX / WASM SIMD / etc + // TODO: add WASM SIMD #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); @@ -2664,12 +2473,6 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - // interleave - const int8x16_t v0_0lz = vzip1q_s8(v0_0l, v0_0h); - const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h); - const int8x16_t v0_1lz = vzip1q_s8(v0_1l, v0_1h); - const int8x16_t v0_1hz = vzip2q_s8(v0_1l, v0_1h); - // load y const int8x16_t v1_0l = vld1q_s8(y0->qs); const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); @@ -2678,21 +2481,21 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * #if defined(__ARM_FEATURE_DOTPROD) // dot product into int32x4_t - const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h); - const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h); + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d); #else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h)); + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l)); + const int16x8_t ph0l = 
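/*
 * Fallback for cores without the ARMv8.2 dot-product extension:
 * vmull_s8 widens the int8 products to int16 and vpaddlq_s16
 * pairwise-accumulates them into int32 lanes, costing roughly twice
 * the instructions of a single vdotq_s32 per 16-byte pair.
 */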
vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h)); - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h)); + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h)); const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); @@ -2738,27 +2541,30 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * #else // scalar float sumf = 0.0; + for (int i = 0; i < nb; i++) { - const float d0 = x[i].d; - const float m0 = x[i].m; - const float d1 = y[i].d; + // unpack nibbles into bytes + uint64_t qs[QK8_1 / 8] = {0}; - const uint8_t * restrict p0 = x[i].qs; - const int8_t * restrict p1 = y[i].qs; + memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); - // TODO: this is very slow .. - for (int j = 0; j < QK8_1/2; j++) { - const uint8_t v0 = p0[j]; + for (int l = 0; l < qk / 16; ++l) { + qs[l + qk/16] = (qs[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; + qs[l + 0 ] = (qs[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; + } - const float f0 = d0*(v0 & 0x0F) + m0; - const float f1 = d0*(v0 >> 4) + m0; + const uint8_t * restrict px = (const uint8_t * restrict) qs; + const int8_t * restrict py = y[i].qs; - const float f2 = d1*p1[2*j + 0]; - const float f3 = d1*p1[2*j + 1]; + int sumi = 0; - sumf += f0*f2 + f1*f3; + for (int j = 0; j < qk; ++j) { + sumi += px[j]*py[j]; } + + sumf += (x[i].d*y[i].d)*sumi + x[i].m*(y[i].s0 + y[i].s1); } + *s = sumf; #endif } @@ -12707,7 +12513,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK4_0; for (int j = 0; j < n; j += k) { - block_q4_0 * restrict y = (block_q4_0 *)dst + j/QK4_0; + block_q4_0 * restrict y = (block_q4_0 *) dst + j/QK4_0; quantize_row_q4_0_reference(src + j, y, k); @@ -12730,7 +12536,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK4_1; for (int j = 0; j < n; j += k) { - block_q4_1 * restrict y = (block_q4_1 *)dst + j/QK4_1; + block_q4_1 * restrict y = (block_q4_1 *) dst + j/QK4_1; quantize_row_q4_1_reference(src + j, y, k); From fd2a137fac3a2ca48d7ef16fea2bf12c1f401397 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 May 2023 21:51:42 +0300 Subject: [PATCH 03/32] ggml : nibbles_from_floats() + bytes_from_nibbles() (ARM NEON) --- ggml.c | 131 ++++++++++++++++++++++++++------------------------------- 1 file changed, 60 insertions(+), 71 deletions(-) diff --git a/ggml.c b/ggml.c index cb8cea045567b..f165129eaad45 100644 --- a/ggml.c +++ b/ggml.c @@ -615,6 +615,50 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) #if __ARM_NEON +static inline const uint8_t * bytes_from_nibbles_64(const int qk, const uint8_t * qs, uint64_t * qd) { + memcpy(qd, qs, qk/2); + + for (int l = 0; l < qk/16; ++l) { + qd[l + qk/16] = (qd[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; + qd[l + 0 ] = (qd[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; + } + + return (const uint8_t *) qd; +} + +// pack first half of weights 
into low nibbles and second half into high nibbles +// use one scaling factor +static inline void nibbles_from_floats_64_0(const int qk, const float * x, float id, uint8_t * qs, uint64_t * qd) { + for (int l = 0; l < qk/2; ++l) { + const float v0 = x[0 + l]*id; + const float v1 = x[qk/2 + l]*id; + + const uint64_t vi0 = MIN(15, (int8_t)(v0 + 8.5f)); + const uint64_t vi1 = MIN(15, (int8_t)(v1 + 8.5f)); + + qd[l/8] |= vi0 << (8*(l & 7)); + qd[l/8] |= vi1 << (8*(l & 7) + 4); + } + + memcpy(qs, qd, qk/2); +} + +// use offset and scaling factor +static inline void nibbles_from_floats_64_1(const int qk, const float * x, float id, float min, uint8_t * qs, uint64_t * qd) { + for (int l = 0; l < qk/2; ++l) { + const float v0 = (x[0 + l] - min)*id; + const float v1 = (x[qk/2 + l] - min)*id; + + const uint64_t vi0 = MIN(15, (int8_t)(v0 + 0.5f)); + const uint64_t vi1 = MIN(15, (int8_t)(v1 + 0.5f)); + + qd[l/8] |= vi0 << (8*(l & 7)); + qd[l/8] |= vi1 << (8*(l & 7) + 4); + } + + memcpy(qs, qd, qk/2); +} + #if !defined(__aarch64__) inline static uint16_t vaddvq_u8(uint8x16_t v) { @@ -863,19 +907,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r uint64_t qs[QK4_0 / 16] = {0}; - // pack first half of weights into low nibbles and second half into high nibbles - for (int l = 0; l < qk/2; ++l) { - const float v0 = x[i*qk + 0 + l]*id; - const float v1 = x[i*qk + qk/2 + l]*id; - - const uint64_t vi0 = MIN(15, (int8_t)(v0 + 8.5f)); - const uint64_t vi1 = MIN(15, (int8_t)(v1 + 8.5f)); - - qs[l/8] |= vi0 << (8*(l & 7)); - qs[l/8] |= vi1 << (8*(l & 7) + 4); - } - - memcpy(y[i].qs, qs, sizeof(qs)); + nibbles_from_floats_64_0(qk, x + i*qk, id, y[i].qs, qs); } } @@ -910,19 +942,7 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r uint64_t qs[QK4_1 / 16] = {0}; - // pack first half of weights into low nibbles and second half into high nibbles - for (int l = 0; l < qk/2; ++l) { - const float v0 = (x[i*qk + 0 + l] - min)*id; - const float v1 = (x[i*qk + qk/2 + l] - min)*id; - - const uint64_t vi0 = MIN(15, (int8_t)(v0 + 0.5f)); - const uint64_t vi1 = MIN(15, (int8_t)(v1 + 0.5f)); - - qs[l/8] |= vi0 << (8*(l & 7)); - qs[l/8] |= vi1 << (8*(l & 7) + 4); - } - - memcpy(y[i].qs, qs, sizeof(qs)); + nibbles_from_floats_64_1(qk, x + i*qk, id, min, y[i].qs, qs); } } @@ -1435,20 +1455,12 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict const int nb = k / qk; + uint64_t qs[QK4_0 / 8]; + for (int i = 0; i < nb; i++) { const float d = x[i].d; - // unpack nibbles into bytes - uint64_t qs[QK4_0 / 8] = {0}; - - memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); - - for (int l = 0; l < qk / 16; ++l) { - qs[l + qk/16] = (qs[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; - qs[l + 0 ] = (qs[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; - } - - const uint8_t * restrict qsp = (const uint8_t * restrict) qs; + const uint8_t * qsp = bytes_from_nibbles_64(qk, x[i].qs, qs); for (int l = 0; l < qk; ++l) { y[i*qk + l] = (qsp[l] - 8)*d; @@ -1464,21 +1476,13 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict const int nb = k / qk; + uint64_t qs[QK4_0 / 8]; + for (int i = 0; i < nb; i++) { const float d = x[i].d; const float m = x[i].m; - // unpack nibbles into bytes - uint64_t qs[QK4_0 / 8] = {0}; - - memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); - - for (int l = 0; l < qk / 16; ++l) { - qs[l + qk/16] = (qs[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; - qs[l + 0 ] = (qs[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; - } - - const uint8_t * restrict qsp = (const uint8_t * 
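/*
 * bytes_from_nibbles_64() above relies on the half-split layout: after
 * copying the qk/2 packed bytes into 64-bit words, one mask-and-shift
 * per word moves the high nibbles into the upper half of the scratch
 * buffer and masks the low nibbles in place. For qk = 32:
 *
 *   qd[2..3] = (qd[0..1] & 0xF0F0F0F0F0F0F0F0) >> 4;  // weights 16..31
 *   qd[0..1] =  qd[0..1] & 0x0F0F0F0F0F0F0F0F;        // weights  0..15
 *
 * which leaves all 32 quants as bytes in logical order, handling 8
 * weights per 64-bit operation instead of 2 per byte-wise unpack.
 */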
restrict) qs; + const uint8_t * qsp = bytes_from_nibbles_64(qk, x[i].qs, qs); for (int l = 0; l < qk; ++l) { y[i*qk + l] = qsp[l]*d + m; @@ -2410,19 +2414,12 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * // scalar float sumf = 0.0; + uint64_t qs[QK8_0 / 8]; + for (int i = 0; i < nb; i++) { // unpack nibbles into bytes - uint64_t qs[QK8_0 / 8] = {0}; - - memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); - - for (int l = 0; l < qk / 16; ++l) { - qs[l + qk/16] = (qs[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; - qs[l + 0 ] = (qs[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; - } - - const uint8_t * restrict px = (const uint8_t * restrict) qs; - const int8_t * restrict py = y[i].qs; + const uint8_t * px = bytes_from_nibbles_64(qk, x[i].qs, qs); + const int8_t * py = y[i].qs; int sumi = 0; @@ -2542,19 +2539,11 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * // scalar float sumf = 0.0; - for (int i = 0; i < nb; i++) { - // unpack nibbles into bytes - uint64_t qs[QK8_1 / 8] = {0}; - - memcpy(qs + 0, x[i].qs, sizeof(x[i].qs)); + uint64_t qs[QK8_1 / 8]; - for (int l = 0; l < qk / 16; ++l) { - qs[l + qk/16] = (qs[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; - qs[l + 0 ] = (qs[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; - } - - const uint8_t * restrict px = (const uint8_t * restrict) qs; - const int8_t * restrict py = y[i].qs; + for (int i = 0; i < nb; i++) { + const uint8_t * px = bytes_from_nibbles_64(qk, x[i].qs, qs); + const int8_t * py = y[i].qs; int sumi = 0; From 9f3285f74153ad50c6eef1b47a5650888ebd9e27 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 May 2023 22:07:40 +0300 Subject: [PATCH 04/32] ggml : remove Q4_2 bit shuffling (WIP, BROKEN) --- ggml.c | 117 +++++++++++++++++++++++---------------------------------- 1 file changed, 47 insertions(+), 70 deletions(-) diff --git a/ggml.c b/ggml.c index f165129eaad45..7a9a500cd987c 100644 --- a/ggml.c +++ b/ggml.c @@ -884,7 +884,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r static const int qk = QK4_0; assert(qk / 16 == 0); - assert(k % qk == 0); + assert( k % qk == 0); const int nb = k / qk; @@ -919,7 +919,7 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r const int qk = QK4_1; assert(qk / 16 == 0); - assert(k % qk == 0); + assert( k % qk == 0); const int nb = k / qk; @@ -952,48 +952,37 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k // reference implementation for deterministic creation of model files static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * restrict y, int k) { - assert(k % QK4_2 == 0); + static const int qk = QK4_2; - const int nb = k / QK4_2; + assert(qk / 16 == 0); + assert( k % qk == 0); + + const int nb = k / qk; for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max - float max = 0.0f; + float max = 0.0f; - for (int l = 0; l < QK4_2; l++) { - const float v = x[i*QK4_2 + l]; + for (int l = 0; l < qk; l++) { + const float v = x[i*qk + l]; if (amax < fabsf(v)) { amax = fabsf(v); - max = v; + max = v; } } - const float d = max / -8; - + const float d = max / -8; const float id = d ? 
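/*
 * q4_2 uses the same signed scheme as q4_0 (scale = max / -8, nibbles
 * in [0, 15]) but on 16-element blocks, with the scale stored as fp16
 * via GGML_FP32_TO_FP16 to keep the block compact.
 */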
1.0f/d : 0.0f; y[i].d = GGML_FP32_TO_FP16(d); - for (int l = 0; l < QK4_2; l += 2) { - const float v0 = x[i*QK4_2 + l + 0]*id; - const float v1 = x[i*QK4_2 + l + 1]*id; - - const uint8_t vi0 = MIN(15, (uint8_t)(v0 + 8.5f)); - const uint8_t vi1 = MIN(15, (uint8_t)(v1 + 8.5f)); + uint64_t qs[QK4_2 / 16] = {0}; - assert(vi0 < 16); - assert(vi1 < 16); - - y[i].qs[l/2] = vi0 | (vi1 << 4); - } + nibbles_from_floats_64_0(qk, x + i*qk, id, y[i].qs, qs); } } -static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int k) { - assert(k % QK4_2 == 0); - - block_q4_2 * restrict y = vy; - +static void quantize_row_q4_2(const float * restrict x, void * restrict y, int k) { quantize_row_q4_2_reference(x, y, k); } @@ -1451,7 +1440,7 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict static const int qk = QK4_0; assert(qk / 16 == 0); - assert(k % qk == 0); + assert( k % qk == 0); const int nb = k / qk; @@ -1472,7 +1461,7 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict static const int qk = QK4_1; assert(qk / 16 == 0); - assert(k % qk == 0); + assert( k % qk == 0); const int nb = k / qk; @@ -1490,31 +1479,23 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict } } -static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, int k) { - assert(k % QK4_2 == 0); - const int nb = k / QK4_2; - - const block_q4_2 * restrict x = vx; - - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); +static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict y, int k) { + static const int qk = QK4_2; - const uint8_t * restrict pp = x[i].qs; + assert(qk / 16 == 0); + assert( k % qk == 0); - for (int l = 0; l < QK4_2; l += 2) { - const uint8_t vi = pp[l/2]; + const int nb = k / qk; - const int8_t vi0 = vi & 0x0F; - const int8_t vi1 = vi >> 4; + uint64_t qs[QK4_2 / 8]; - const float v0 = (vi0 - 8)*d; - const float v1 = (vi1 - 8)*d; + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); - y[i*QK4_2 + l + 0] = v0; - y[i*QK4_2 + l + 1] = v1; + const uint8_t * qsp = bytes_from_nibbles_64(qk, x[i].qs, qs); - assert(!isnan(y[i*QK4_2 + l + 0])); - assert(!isnan(y[i*QK4_2 + l + 1])); + for (int l = 0; l < qk; ++l) { + y[i*qk + l] = (qsp[l] - 8)*d; } } } @@ -1634,7 +1615,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_1, }, [GGML_TYPE_Q4_2] = { - .dequantize_row_q = dequantize_row_q4_2, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_2, .quantize_row_q = quantize_row_q4_2, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_reference, .quantize_row_q_dot = quantize_row_q8_0, @@ -2559,11 +2540,13 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * } static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK8_0; + const int qk = QK8_0; + const int nb = n / qk; - assert(n % QK8_0 == 0); + assert(n % qk == 0); assert(nb % 2 == 0); - assert(QK8_0 == 2*QK4_2); + + assert(qk == 2*QK4_2); const block_q4_2 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -2599,12 +2582,6 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); - // interleave - const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs); - const int8x16_t v0_0hz = 
vzip2q_s8(v0_0ls, v0_0hs); - const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs); - const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs); - // load y const int8x16_t v1_0l = vld1q_s8(y0->qs); const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); @@ -2613,22 +2590,22 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * #if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), GGML_FP16_TO_FP32(x0_0->d)), - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), GGML_FP16_TO_FP32(x0_1->d))), y0->d); + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l)), GGML_FP16_TO_FP32(x0_0->d)), + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hs, v1_0h)), GGML_FP16_TO_FP32(x0_1->d))), y0->d); sumv1 = vmlaq_n_f32(sumv1, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l)), GGML_FP16_TO_FP32(x1_0->d)), - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1hz, v1_1h)), GGML_FP16_TO_FP32(x1_1->d))), y1->d); + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l)), GGML_FP16_TO_FP32(x1_0->d)), + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1hs, v1_1h)), GGML_FP16_TO_FP32(x1_1->d))), y1->d); #else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h)); + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h)); const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); From aa78dfed7df3f33642e5c9a459c8be097d43aa9f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 May 2023 22:55:10 +0300 Subject: [PATCH 05/32] ggml : remove Q5_0 bit shuffling (ARM NEON) --- ggml.c | 198 ++++++++++++++++++++++++++------------------------------- 1 file changed, 91 insertions(+), 107 deletions(-) diff --git a/ggml.c b/ggml.c index 7a9a500cd987c..2a25afbbd1275 100644 --- a/ggml.c +++ b/ggml.c @@ -626,39 +626,6 @@ static inline const uint8_t * bytes_from_nibbles_64(const int qk, const uint8_t return (const uint8_t *) qd; } -// pack first half of weights into low nibbles and second half into high nibbles -// use one scaling factor -static inline void nibbles_from_floats_64_0(const int qk, const float * x, float id, uint8_t * qs, uint64_t * qd) { - for (int l = 0; l < qk/2; ++l) { - const float v0 = x[0 + l]*id; - const float v1 = 
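/*
 * The shared packing helpers from the previous commit are removed here
 * and inlined back into each quantizer: the q5_0 path below has to
 * collect a fifth bit per weight inside the same packing loop, which
 * the 4-bit-only helpers could not express.
 */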
x[qk/2 + l]*id; - - const uint64_t vi0 = MIN(15, (int8_t)(v0 + 8.5f)); - const uint64_t vi1 = MIN(15, (int8_t)(v1 + 8.5f)); - - qd[l/8] |= vi0 << (8*(l & 7)); - qd[l/8] |= vi1 << (8*(l & 7) + 4); - } - - memcpy(qs, qd, qk/2); -} - -// use offset and scaling factor -static inline void nibbles_from_floats_64_1(const int qk, const float * x, float id, float min, uint8_t * qs, uint64_t * qd) { - for (int l = 0; l < qk/2; ++l) { - const float v0 = (x[0 + l] - min)*id; - const float v1 = (x[qk/2 + l] - min)*id; - - const uint64_t vi0 = MIN(15, (int8_t)(v0 + 0.5f)); - const uint64_t vi1 = MIN(15, (int8_t)(v1 + 0.5f)); - - qd[l/8] |= vi0 << (8*(l & 7)); - qd[l/8] |= vi1 << (8*(l & 7) + 4); - } - - memcpy(qs, qd, qk/2); -} - #if !defined(__aarch64__) inline static uint16_t vaddvq_u8(uint8x16_t v) { @@ -907,7 +874,18 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r uint64_t qs[QK4_0 / 16] = {0}; - nibbles_from_floats_64_0(qk, x + i*qk, id, y[i].qs, qs); + for (int l = 0; l < qk/2; ++l) { + const float v0 = x[i*qk + 0 + l]*id; + const float v1 = x[i*qk + qk/2 + l]*id; + + const uint64_t vi0 = MIN(15, (int8_t)(v0 + 8.5f)); + const uint64_t vi1 = MIN(15, (int8_t)(v1 + 8.5f)); + + qs[l/8] |= vi0 << (8*(l & 7)); + qs[l/8] |= vi1 << (8*(l & 7) + 4); + } + + memcpy(y[i].qs, qs, qk/2); } } @@ -942,7 +920,18 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r uint64_t qs[QK4_1 / 16] = {0}; - nibbles_from_floats_64_1(qk, x + i*qk, id, min, y[i].qs, qs); + for (int l = 0; l < qk/2; ++l) { + const float v0 = (x[0 + l] - min)*id; + const float v1 = (x[qk/2 + l] - min)*id; + + const uint64_t vi0 = MIN(15, (int8_t)(v0 + 0.5f)); + const uint64_t vi1 = MIN(15, (int8_t)(v1 + 0.5f)); + + qs[l/8] |= vi0 << (8*(l & 7)); + qs[l/8] |= vi1 << (8*(l & 7) + 4); + } + + memcpy(y[i].qs, qs, qk/2); } } @@ -978,7 +967,18 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r uint64_t qs[QK4_2 / 16] = {0}; - nibbles_from_floats_64_0(qk, x + i*qk, id, y[i].qs, qs); + for (int l = 0; l < qk/2; ++l) { + const float v0 = x[i*qk + 0 + l]*id; + const float v1 = x[i*qk + qk/2 + l]*id; + + const uint64_t vi0 = MIN(15, (int8_t)(v0 + 8.5f)); + const uint64_t vi1 = MIN(15, (int8_t)(v1 + 8.5f)); + + qs[l/8] |= vi0 << (8*(l & 7)); + qs[l/8] |= vi1 << (8*(l & 7) + 4); + } + + memcpy(y[i].qs, qs, qk/2); } } @@ -987,51 +987,54 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict y, int k } static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { - assert(k % QK5_0 == 0); - const int nb = k / QK5_0; + static const int qk = QK5_0; + + assert(qk / 16 == 0); + assert( k % qk == 0); + + const int nb = k / qk; for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max - float max = 0.0f; + float max = 0.0f; - for (int l = 0; l < QK5_0; l++) { - const float v = x[i*QK5_0 + l]; + for (int l = 0; l < qk; l++) { + const float v = x[i*qk + l]; if (amax < fabsf(v)) { amax = fabsf(v); - max = v; + max = v; } } - const float d = max / -16; + const float d = max / -16; const float id = d ? 
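/*
 * q5_0 extends the scheme to 5 bits: the scale is max / -16 and values
 * are rounded with +16.5f into [0, 31]. The low four bits of each quant
 * go into the qs nibbles exactly as in q4_0, while the fifth bits are
 * gathered into the 32-bit qh word, bit l for weight l of the first
 * half and bit l + qk/2 for weight l of the second half, matching the
 * non-interleaved nibble layout.
 */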
1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = d; uint32_t qh = 0; + uint64_t qs[QK5_0 / 16] = {0}; - for (int l = 0; l < QK5_0; l += 2) { - const float v0 = x[i*QK5_0 + l + 0]*id; - const float v1 = x[i*QK5_0 + l + 1]*id; + for (int l = 0; l < qk/2; ++l) { + const float v0 = x[i*qk + 0 + l]*id; + const float v1 = x[i*qk + qk/2 + l]*id; - const uint32_t vi0 = MIN(31, (int) (v0 + 16.5f)); - const uint32_t vi1 = MIN(31, (int) (v1 + 16.5f)); + const uint64_t vi0 = MIN(31, (int8_t)(v0 + 16.5f)); + const uint64_t vi1 = MIN(31, (int8_t)(v1 + 16.5f)); - y[i].qs[l/2] = (vi0 & 0x0F) | ((vi1 & 0x0F) << 4); + qs[l/8] |= vi0 << (8*(l & 7)); + qs[l/8] |= vi1 << (8*(l & 7) + 4); // get the 5-th bit and store it in qh at the right position qh |= ((vi0 & 0x10) >> 4) << (l + 0); - qh |= ((vi1 & 0x10) >> 4) << (l + 1); + qh |= ((vi1 & 0x10) >> 4) << (l + qk/2); } - memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); + memcpy( y[i].qs, qs, qk/2); + memcpy(&y[i].qh, &qh, sizeof(qh)); } } -static void quantize_row_q5_0(const float * restrict x, void * restrict vy, int k) { - assert(k % QK5_0 == 0); - - block_q5_0 * restrict y = vy; - +static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { quantize_row_q5_0_reference(x, y, k); } @@ -1500,38 +1503,28 @@ static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict } } -static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, int k) { - assert(k % QK5_0 == 0); - const int nb = k / QK5_0; +static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { + static const int qk = QK4_0; - const block_q5_0 * restrict x = vx; + assert(qk / 16 == 0); + assert( k % qk == 0); + + const int nb = k / qk; + + uint64_t qs[QK5_0 / 8]; for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict pp = x[i].qs; - uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); - for (int l = 0; l < QK5_0; l += 2) { - const uint8_t vi = pp[l/2]; - - // extract the 5-th bit from qh - const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; - - const int8_t vi0 = (vi & 0x0F) | vh0; - const int8_t vi1 = (vi >> 4) | vh1; - - const float v0 = (vi0 - 16)*d; - const float v1 = (vi1 - 16)*d; + const uint8_t * qsp = bytes_from_nibbles_64(qk, x[i].qs, qs); - y[i*QK5_0 + l + 0] = v0; - y[i*QK5_0 + l + 1] = v1; + for (int l = 0; l < qk; ++l) { + const uint8_t vh = ((qh & (1u << l)) >> l) << 4; - assert(!isnan(y[i*QK5_0 + l + 0])); - assert(!isnan(y[i*QK5_0 + l + 1])); + y[i*qk + l] = ((qsp[l] | vh) - 16)*d; } } } @@ -1623,7 +1616,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q5_0] = { - .dequantize_row_q = dequantize_row_q5_0, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_0, .quantize_row_q = quantize_row_q5_0, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_0_reference, .quantize_row_q_dot = quantize_row_q8_0, @@ -2693,11 +2686,12 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * } static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK8_0; + const int qk = QK8_0; + const int nb = n / qk; - assert(n % QK8_0 == 0); + assert(n % qk == 0); assert(nb % 2 == 0); - assert(QK8_0 == QK5_0); + assert(qk == QK5_0); const block_q5_0 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -2732,13 
+2726,9 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const int8x16_t v0l = vreinterpretq_s8_u8(vandq_u8 (v0, m4b)); const int8x16_t v0h = vreinterpretq_s8_u8(vshrq_n_u8(v0, 4)); - // interleave - const int8x16_t v0lz = vzip1q_s8(v0l, v0h); - const int8x16_t v0hz = vzip2q_s8(v0l, v0h); - // add high bit and sub 16 - const int8x16_t v0lf = vsubq_s8(vorrq_s8(v0lz, qhl), s16b); - const int8x16_t v0hf = vsubq_s8(vorrq_s8(v0hz, qhh), s16b); + const int8x16_t v0lf = vsubq_s8(vorrq_s8(v0l, qhl), s16b); + const int8x16_t v0hf = vsubq_s8(vorrq_s8(v0h, qhh), s16b); // load y const int8x16_t v1l = vld1q_s8(y0->qs); @@ -2856,34 +2846,28 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * #else // scalar float sumf = 0.0; + + uint64_t qs[QK8_0 / 8]; + for (int i = 0; i < nb; i++) { - const uint8_t * restrict x0 = x[i].qs; - const int8_t * restrict y0 = y[i].qs; + // unpack nibbles into bytes + const uint8_t * px = bytes_from_nibbles_64(qk, x[i].qs, qs); + const int8_t * py = y[i].qs; uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); - const float d = GGML_FP16_TO_FP32(x[i].d); - - int sxy = 0; - - for (int j = 0; j < QK8_0/2; j++) { - const uint8_t v0 = x0[j]; - - const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4; - const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4; - - const int x0_0 = ((v0 & 0x0F) | x0_0h) - 16; - const int x1_0 = ((v0 >> 4) | x1_0h) - 16; + int sumi = 0; - const int y0_0 = y0[2*j + 0]; - const int y1_0 = y0[2*j + 1]; + for (int j = 0; j < qk; ++j) { + const int xh = ((qh & (1u << j)) >> j) << 4; - sxy += x0_0*y0_0 + x1_0*y1_0; + sumi += ((px[j] | xh) - 16)*py[j]; } - sumf += (d*sxy)*y[i].d; + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi; } + *s = sumf; #endif } From b37a08f6468c454110a3e844fe0978672c213df9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 May 2023 23:31:35 +0300 Subject: [PATCH 06/32] ggml : 2x faster scalar implementations --- ggml.c | 132 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 67 insertions(+), 65 deletions(-) diff --git a/ggml.c b/ggml.c index 2a25afbbd1275..98e7a1fdd6894 100644 --- a/ggml.c +++ b/ggml.c @@ -615,7 +615,8 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) #if __ARM_NEON -static inline const uint8_t * bytes_from_nibbles_64(const int qk, const uint8_t * qs, uint64_t * qd) { +// TODO: obsolete - will be removed +static inline const uint8_t * b4_from_nibbles_64(const int qk, const uint8_t * qs, uint64_t * qd) { memcpy(qd, qs, qk/2); for (int l = 0; l < qk/16; ++l) { @@ -875,14 +876,14 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r uint64_t qs[QK4_0 / 16] = {0}; for (int l = 0; l < qk/2; ++l) { - const float v0 = x[i*qk + 0 + l]*id; - const float v1 = x[i*qk + qk/2 + l]*id; + const float x0 = x[i*qk + 0 + l]*id; + const float x1 = x[i*qk + qk/2 + l]*id; - const uint64_t vi0 = MIN(15, (int8_t)(v0 + 8.5f)); - const uint64_t vi1 = MIN(15, (int8_t)(v1 + 8.5f)); + const uint64_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint64_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); - qs[l/8] |= vi0 << (8*(l & 7)); - qs[l/8] |= vi1 << (8*(l & 7) + 4); + qs[l/8] |= xi0 << (8*(l & 7)); + qs[l/8] |= xi1 << (8*(l & 7) + 4); } memcpy(y[i].qs, qs, qk/2); @@ -921,14 +922,14 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r uint64_t qs[QK4_1 / 16] = {0}; for (int l = 0; l < qk/2; ++l) { - const float v0 = (x[0 + l] - min)*id; - const float v1 = (x[qk/2
+ l] - min)*id; + const float x0 = (x[0 + l] - min)*id; + const float x1 = (x[qk/2 + l] - min)*id; - const uint64_t vi0 = MIN(15, (int8_t)(v0 + 0.5f)); - const uint64_t vi1 = MIN(15, (int8_t)(v1 + 0.5f)); + const uint64_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint64_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); - qs[l/8] |= vi0 << (8*(l & 7)); - qs[l/8] |= vi1 << (8*(l & 7) + 4); + qs[l/8] |= xi0 << (8*(l & 7)); + qs[l/8] |= xi1 << (8*(l & 7) + 4); } memcpy(y[i].qs, qs, qk/2); @@ -968,14 +969,14 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r uint64_t qs[QK4_2 / 16] = {0}; for (int l = 0; l < qk/2; ++l) { - const float v0 = x[i*qk + 0 + l]*id; - const float v1 = x[i*qk + qk/2 + l]*id; + const float x0 = x[i*qk + 0 + l]*id; + const float x1 = x[i*qk + qk/2 + l]*id; - const uint64_t vi0 = MIN(15, (int8_t)(v0 + 8.5f)); - const uint64_t vi1 = MIN(15, (int8_t)(v1 + 8.5f)); + const uint64_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint64_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); - qs[l/8] |= vi0 << (8*(l & 7)); - qs[l/8] |= vi1 << (8*(l & 7) + 4); + qs[l/8] |= xi0 << (8*(l & 7)); + qs[l/8] |= xi1 << (8*(l & 7) + 4); } memcpy(y[i].qs, qs, qk/2); @@ -1015,18 +1016,18 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r uint64_t qs[QK5_0 / 16] = {0}; for (int l = 0; l < qk/2; ++l) { - const float v0 = x[i*qk + 0 + l]*id; - const float v1 = x[i*qk + qk/2 + l]*id; + const float x0 = x[i*qk + 0 + l]*id; + const float x1 = x[i*qk + qk/2 + l]*id; - const uint64_t vi0 = MIN(31, (int8_t)(v0 + 16.5f)); - const uint64_t vi1 = MIN(31, (int8_t)(v1 + 16.5f)); + const uint64_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); + const uint64_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); - qs[l/8] |= vi0 << (8*(l & 7)); - qs[l/8] |= vi1 << (8*(l & 7) + 4); + qs[l/8] |= xi0 << (8*(l & 7)); + qs[l/8] |= xi1 << (8*(l & 7) + 4); // get the 5-th bit and store it in qh at the right position - qh |= ((vi0 & 0x10) >> 4) << (l + 0); - qh |= ((vi1 & 0x10) >> 4) << (l + qk/2); + qh |= ((xi0 & 0x10) >> 4) << (l + 0); + qh |= ((xi1 & 0x10) >> 4) << (l + qk/2); } memcpy( y[i].qs, qs, qk/2); @@ -1447,15 +1448,15 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict const int nb = k / qk; - uint64_t qs[QK4_0 / 8]; - for (int i = 0; i < nb; i++) { const float d = x[i].d; - const uint8_t * qsp = bytes_from_nibbles_64(qk, x[i].qs, qs); + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0xf) - 8; + const int x1 = (x[i].qs[j] >> 4) - 8; - for (int l = 0; l < qk; ++l) { - y[i*qk + l] = (qsp[l] - 8)*d; + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; } } } @@ -1468,21 +1469,22 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict const int nb = k / qk; - uint64_t qs[QK4_0 / 8]; - for (int i = 0; i < nb; i++) { const float d = x[i].d; const float m = x[i].m; - const uint8_t * qsp = bytes_from_nibbles_64(qk, x[i].qs, qs); + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0xf); + const int x1 = (x[i].qs[j] >> 4); - for (int l = 0; l < qk; ++l) { - y[i*qk + l] = qsp[l]*d + m; + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; } } } static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict y, int k) { + // BROKEN !!!
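// Aside: a minimal standalone sketch of the nibble layout these commits
// converge on — not part of the patch. Element j of a block goes into the low
// nibble of byte j and element j + qk/2 into the high nibble, so unpacking is
// a mask and a shift with no vzip interleave. Names are illustrative, not ggml
// API; qk = 32 is assumed, as for QK4_0.
#include <assert.h>
#include <stdint.h>

static void pack_nibbles(const uint8_t v[32], uint8_t qs[16]) {
    for (int j = 0; j < 16; ++j) {
        qs[j] = (v[j] & 0x0F) | ((v[16 + j] & 0x0F) << 4); // low = first half, high = second half
    }
}

static void unpack_nibbles(const uint8_t qs[16], uint8_t v[32]) {
    for (int j = 0; j < 16; ++j) {
        v[j]      = qs[j] & 0x0F; // first half from the low nibbles
        v[16 + j] = qs[j] >> 4;   // second half from the high nibbles
    }
}

int main(void) {
    uint8_t v[32], qs[16], w[32];
    for (int j = 0; j < 32; ++j) v[j] = (uint8_t)(j & 0x0F);
    pack_nibbles(v, qs);
    unpack_nibbles(qs, w);
    for (int j = 0; j < 32; ++j) assert(v[j] == w[j]); // lossless round trip
    return 0;
}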
static const int qk = QK4_2; assert(qk / 16 == 0); @@ -1495,7 +1497,7 @@ static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * qsp = bytes_from_nibbles_64(qk, x[i].qs, qs); + const uint8_t * qsp = b4_from_nibbles_64(qk, x[i].qs, qs); for (int l = 0; l < qk; ++l) { y[i*qk + l] = (qsp[l] - 8)*d; @@ -1511,20 +1513,21 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict const int nb = k / qk; - uint64_t qs[QK5_0 / 8]; - for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); - const uint8_t * qsp = bytes_from_nibbles_64(qk, x[i].qs, qs); + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - for (int l = 0; l < qk; ++l) { - const uint8_t vh = ((qh & (1u << l)) >> l) << 4; + const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - y[i*qk + l] = ((qsp[l] | vh) - 16)*d; + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; } } } @@ -2388,17 +2391,16 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * // scalar float sumf = 0.0; - uint64_t qs[QK8_0 / 8]; - for (int i = 0; i < nb; i++) { - // unpack nibbles into bytes - const uint8_t * px = bytes_from_nibbles_64(qk, x[i].qs, qs); - const int8_t * py = y[i].qs; + const int8_t * py = y[i].qs; int sumi = 0; - for (int j = 0; j < qk; ++j) { - sumi += (px[j] - 8) * py[j]; + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0xf) - 8; + const int v1 = (x[i].qs[j] >> 4) - 8; + + sumi += (v0 * py[j]) + (v1 * py[j + qk/2]); } sumf += (x[i].d*y[i].d)*sumi; @@ -2513,16 +2515,16 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * // scalar float sumf = 0.0; - uint64_t qs[QK8_1 / 8]; - for (int i = 0; i < nb; i++) { - const uint8_t * px = bytes_from_nibbles_64(qk, x[i].qs, qs); - const int8_t * py = y[i].qs; + const int8_t * py = y[i].qs; int sumi = 0; - for (int j = 0; j < qk; ++j) { - sumi += px[j]*py[j]; + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0xf); + const int v1 = (x[i].qs[j] >> 4); + + sumi += (v0 * py[j]) + (v1 * py[j + qk/2]); } sumf += (x[i].d*y[i].d)*sumi + x[i].m*(y[i].s0 + y[i].s1); @@ -2847,22 +2849,22 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * // scalar float sumf = 0.0; - uint64_t qs[QK8_0 / 8]; - for (int i = 0; i < nb; i++) { - // unpack nibbles into bytes - const uint8_t * px = bytes_from_nibbles_64(qk, x[i].qs, qs); - const int8_t * py = y[i].qs; + const int8_t * py = y[i].qs; uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); int sumi = 0; - for (int j = 0; j < qk; ++j) { - const int xh = ((qh & (1u << j)) >> j) << 4; + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - sumi += ((px[j] | xh) - 16)*py[j]; + sumi += (x0 * py[j]) + (x1 * py[j + qk/2]); } sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi; From 292a778ca2246c385047c752fe1e0fa5c9c564e3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 5 May 2023 17:09:11 +0300 Subject: [PATCH 07/32] ggml : remove Q5_1 bit shuffling (ARM NEON + scalar) --- ggml.c | 
172 ++++++++++++++++++++++----------------------------------- 1 file changed, 66 insertions(+), 106 deletions(-) diff --git a/ggml.c b/ggml.c index 98e7a1fdd6894..cb321455b4dd2 100644 --- a/ggml.c +++ b/ggml.c @@ -851,8 +851,7 @@ static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block s static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { static const int qk = QK4_0; - assert(qk / 16 == 0); - assert( k % qk == 0); + assert(k % qk == 0); const int nb = k / qk; @@ -873,20 +872,16 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r y[i].d = d; - uint64_t qs[QK4_0 / 16] = {0}; - for (int l = 0; l < qk/2; ++l) { const float x0 = x[i*qk + 0 + l]*id; const float x1 = x[i*qk + qk/2 + l]*id; - const uint64_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); - const uint64_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); - qs[l/8] |= xi0 << (8*(l & 7)); - qs[l/8] |= xi1 << (8*(l & 7) + 4); + y[i].qs[l] = xi0; + y[i].qs[l] |= xi1 << 4; } - - memcpy(y[i].qs, qs, qk/2); } } @@ -897,8 +892,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict y, int k static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { const int qk = QK4_1; - assert(qk / 16 == 0); - assert( k % qk == 0); + assert(k % qk == 0); const int nb = k / qk; @@ -919,20 +913,16 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r y[i].d = d; y[i].m = min; - uint64_t qs[QK4_1 / 16] = {0}; - for (int l = 0; l < qk/2; ++l) { const float x0 = (x[0 + l] - min)*id; const float x1 = (x[qk/2 + l] - min)*id; - const uint64_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); - const uint64_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); - qs[l/8] |= xi0 << (8*(l & 7)); - qs[l/8] |= xi1 << (8*(l & 7) + 4); + y[i].qs[l] = xi0; + y[i].qs[l] |= xi1 << 4; } - - memcpy(y[i].qs, qs, qk/2); } } @@ -944,8 +934,7 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * restrict y, int k) { static const int qk = QK4_2; - assert(qk / 16 == 0); - assert( k % qk == 0); + assert(k % qk == 0); const int nb = k / qk; @@ -990,8 +979,7 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict y, int k static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { static const int qk = QK5_0; - assert(qk / 16 == 0); - assert( k % qk == 0); + assert(k % qk == 0); const int nb = k / qk; @@ -1013,24 +1001,21 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r y[i].d = d; uint32_t qh = 0; - uint64_t qs[QK5_0 / 16] = {0}; for (int l = 0; l < qk/2; ++l) { const float x0 = x[i*qk + 0 + l]*id; const float x1 = x[i*qk + qk/2 + l]*id; - const uint64_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); - const uint64_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); + const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); + const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); - qs[l/8] |= xi0 << (8*(l & 7)); - qs[l/8] |= xi1 << (8*(l & 7) + 4); + y[i].qs[l] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); // get the 5-th bit and store it in qh at the right position qh |= ((xi0 & 0x10) >> 4) << (l + 0); qh |= ((xi1 & 0x10) >> 4) << (l + qk/2); } - memcpy( y[i].qs, qs, qk/2); 
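// Aside, not part of the patch: with this ordering, Q5_0 keeps the 5-th bit of
// first-half element j at qh bit j, and of second-half element j at qh bit
// j + 16. The scalar paths later recover it with a shift that parks the bit at
// position 4, ready to OR onto the nibble. A minimal self-contained check,
// assuming qk = 32 as for QK5_0:
#include <assert.h>
#include <stdint.h>

int main(void) {
    const int j = 5;      // arbitrary element index in [0, qk/2)
    uint32_t qh = 0;

    qh |= 1u << (j + 0);  // 5-th bit of first-half element j
    qh |= 1u << (j + 16); // 5-th bit of second-half element j

    // extraction as in the scalar kernels: >> (j + 12) lands the bit at position 4
    const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
    const uint8_t xh_1 =  (qh & (1u << (j + 16))) >> (j + 12);

    assert(xh_0 == 0x10 && xh_1 == 0x10);
    return 0;
}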
memcpy(&y[i].qh, &qh, sizeof(qh)); } } @@ -1040,20 +1025,24 @@ static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k } static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { - assert(k % QK5_1 == 0); - const int nb = k / QK5_1; + const int qk = QK5_1; + + assert(k % qk == 0); + + const int nb = k / qk; for (int i = 0; i < nb; i++) { float min = FLT_MAX; float max = -FLT_MAX; - for (int l = 0; l < QK5_1; l++) { - const float v = x[i*QK5_1 + l]; + for (int l = 0; l < qk; l++) { + const float v = x[i*qk + l]; + if (v < min) min = v; if (v > max) max = v; } - const float d = (max - min) / ((1 << 5) - 1); + const float d = (max - min) / ((1 << 5) - 1); const float id = d ? 1.0f/d : 0.0f; y[i].d = GGML_FP32_TO_FP16(d); @@ -1061,29 +1050,25 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r uint32_t qh = 0; - for (int l = 0; l < QK5_1; l += 2) { - const float v0 = (x[i*QK5_1 + l + 0] - min)*id; - const float v1 = (x[i*QK5_1 + l + 1] - min)*id; + for (int l = 0; l < qk/2; ++l) { + const float x0 = (x[i*qk + 0 + l] - min)*id; + const float x1 = (x[i*qk + qk/2 + l] - min)*id; - const uint32_t vi0 = (int) (v0 + 0.5f); - const uint32_t vi1 = (int) (v1 + 0.5f); + const uint8_t xi0 = (uint8_t)(x0 + 0.5f); + const uint8_t xi1 = (uint8_t)(x1 + 0.5f); - y[i].qs[l/2] = (vi0 & 0x0F) | ((vi1 & 0x0F) << 4); + y[i].qs[l] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); // get the 5-th bit and store it in qh at the right position - qh |= ((vi0 & 0x10) >> 4) << (l + 0); - qh |= ((vi1 & 0x10) >> 4) << (l + 1); + qh |= ((xi0 & 0x10) >> 4) << (l + 0); + qh |= ((xi1 & 0x10) >> 4) << (l + qk/2); } memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); } } -static void quantize_row_q5_1(const float * restrict x, void * restrict vy, int k) { - assert(k % QK5_1 == 0); - - block_q5_1 * restrict y = vy; - +static void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { quantize_row_q5_1_reference(x, y, k); } @@ -1443,8 +1428,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { static const int qk = QK4_0; - assert(qk / 16 == 0); - assert( k % qk == 0); + assert(k % qk == 0); const int nb = k / qk; @@ -1464,8 +1448,7 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict static const int qk = QK4_1; - assert(qk / 16 == 0); - assert( k % qk == 0); + assert(k % qk == 0); const int nb = k / qk; @@ -1487,8 +1470,7 @@ static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict // BROKEN !!!
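// Aside, not part of the patch: the _1 formats carry an offset m, and the
// scalar dot products avoid dequantizing x by using the identity
//   sum_j (qx[j]*d + m)*(qy[j]*dy) = (d*dy)*sum_j qx[j]*qy[j] + m*(dy*sum_j qy[j]),
// where dy*sum(qy) is what q8_1 precomputes as s0 + s1. A toy numeric check
// with made-up values:
#include <assert.h>
#include <math.h>

int main(void) {
    const float d = 0.5f, m = -1.0f, dy = 0.25f;
    const int qx[4] = {1, 7, 3, 15};
    const int qy[4] = {-3, 8, 1, -7};

    float ref = 0.0f;
    int sumi = 0, sq = 0;

    for (int j = 0; j < 4; ++j) {
        ref  += (qx[j]*d + m)*(qy[j]*dy); // dot product of dequantized values
        sumi += qx[j]*qy[j];
        sq   += qy[j];
    }

    const float fast = (d*dy)*sumi + m*(dy*sq); // dy*sq plays the role of s0 + s1

    assert(fabsf(ref - fast) < 1e-5f);
    return 0;
}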
static const int qk = QK4_2; - assert(qk / 16 == 0); - assert( k % qk == 0); + assert(k % qk == 0); const int nb = k / qk; @@ -1508,8 +1490,7 @@ static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { static const int qk = QK4_0; - assert(qk / 16 == 0); - assert( k % qk == 0); + assert(k % qk == 0); const int nb = k / qk; @@ -1532,39 +1513,29 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict } } -static void dequantize_row_q5_1(const void * restrict vx, float * restrict y, int k) { - assert(k % QK5_1 == 0); - const int nb = k / QK5_1; +static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) { + static const int qk = QK5_1; - const block_q5_1 * restrict x = vx; + assert(k % qk == 0); + + const int nb = k / qk; for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); const float m = GGML_FP16_TO_FP32(x[i].m); - const uint8_t * restrict pp = x[i].qs; - uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); - for (int l = 0; l < QK5_1; l += 2) { - const uint8_t vi = pp[l/2]; - - // extract the 5-th bit from qh - const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; - - const uint8_t vi0 = (vi & 0x0F) | vh0; - const uint8_t vi1 = (vi >> 4) | vh1; - - const float v0 = vi0*d + m; - const float v1 = vi1*d + m; + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - y[i*QK5_1 + l + 0] = v0; - y[i*QK5_1 + l + 1] = v1; + const int x0 = (x[i].qs[j] & 0xf) | xh_0; + const int x1 = (x[i].qs[j] >> 4) | xh_1; - assert(!isnan(y[i*QK5_1 + l + 0])); - assert(!isnan(y[i*QK5_1 + l + 1])); + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; } } } @@ -1627,7 +1598,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q5_1] = { - .dequantize_row_q = dequantize_row_q5_1, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_1, .quantize_row_q = quantize_row_q5_1, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_1_reference, .quantize_row_q_dot = quantize_row_q8_1, @@ -2875,11 +2846,12 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * } static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK8_1; + const int qk = QK8_1; + const int nb = n / qk; - assert(n % QK8_1 == 0); + assert(n % qk == 0); assert(nb % 2 == 0); - assert(QK8_1 == QK5_1); + assert(qk == QK5_1); const block_q5_1 * restrict x = vx; const block_q8_1 * restrict y = vy; @@ -2915,13 +2887,9 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const int8x16_t v0l = vreinterpretq_s8_u8(vandq_u8 (v0, vdupq_n_u8(0x0F))); const int8x16_t v0h = vreinterpretq_s8_u8(vshrq_n_u8(v0, 4)); - // interleave - const int8x16_t v0lz = vzip1q_s8(v0l, v0h); - const int8x16_t v0hz = vzip2q_s8(v0l, v0h); - // add - const int8x16_t v0lf = vorrq_s8(v0lz, qhl); - const int8x16_t v0hf = vorrq_s8(v0hz, qhh); + const int8x16_t v0lf = vorrq_s8(v0l, qhl); + const int8x16_t v0hf = vorrq_s8(v0h, qhh); // load y const int8x16_t v1l = vld1q_s8(y0->qs); @@ -3044,36 +3012,28 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * *s = hsum_float_8(acc) 
+ summs; #else + // scalar float sumf = 0.0; for (int i = 0; i < nb; i++) { - const uint8_t * restrict x0 = x[i].qs; - const int8_t * restrict y0 = y[i].qs; + const int8_t * py = y[i].qs; uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); - const float d = GGML_FP16_TO_FP32(x[i].d); - const float m = GGML_FP16_TO_FP32(x[i].m); - - int sxy = 0; - - for (int j = 0; j < QK8_1/2; j++) { - const uint8_t v0 = x0[j]; - - const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4; - const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4; + int sumi = 0; - const int x0_0 = (v0 & 0x0F) | x0_0h; - const int x1_0 = (v0 >> 4) | x1_0h; + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - const int y0_0 = y0[2*j + 0]; - const int y1_0 = y0[2*j + 1]; + const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; - sxy += x0_0*y0_0 + x1_0*y1_0; + sumi += (x0 * py[j]) + (x1 * py[j + qk/2]); } - sumf += (d*sxy)*y[i].d + m*(y[i].s0 + y[i].s1); + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*(y[i].s0 + y[i].s1); } *s = sumf; From caaacd576552db0a76cd89ca18b0940e80302ead Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 5 May 2023 17:12:58 +0300 Subject: [PATCH 08/32] ggml : simplify scalar dot --- ggml.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/ggml.c b/ggml.c index cb321455b4dd2..1e23d3ec83977 100644 --- a/ggml.c +++ b/ggml.c @@ -2363,15 +2363,13 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * float sumf = 0.0; for (int i = 0; i < nb; i++) { - const int8_t * py = y[i].qs; - int sumi = 0; for (int j = 0; j < qk/2; ++j) { const int v0 = (x[i].qs[j] & 0xf) - 8; const int v1 = (x[i].qs[j] >> 4) - 8; - sumi += (v0 * py[j]) + (v1 * py[j + qk/2]); + sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } sumf += (x[i].d*y[i].d)*sumi; @@ -2487,15 +2485,13 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * float sumf = 0.0; for (int i = 0; i < nb; i++) { - const int8_t * py = y[i].qs; - int sumi = 0; for (int j = 0; j < qk/2; ++j) { const int v0 = (x[i].qs[j] & 0xf); const int v1 = (x[i].qs[j] >> 4); - sumi += (v0 * py[j]) + (v1 * py[j + qk/2]); + sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } sumf += (x[i].d*y[i].d)*sumi + x[i].m*(y[i].s0 + y[i].s1); @@ -2821,8 +2817,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * float sumf = 0.0; for (int i = 0; i < nb; i++) { - const int8_t * py = y[i].qs; - uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); @@ -2835,7 +2829,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - sumi += (x0 * py[j]) + (x1 * py[j + qk/2]); + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); } sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi; @@ -3016,8 +3010,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * float sumf = 0.0; for (int i = 0; i < nb; i++) { - const int8_t * py = y[i].qs; - uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); @@ -3030,7 +3022,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; - sumi += (x0 * py[j]) + (x1 * py[j + qk/2]); + sumi += (x0 * 
y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); } sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*(y[i].s0 + y[i].s1); From 0add6402bdbc825c27a101730756a7cb037da5ff Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 5 May 2023 17:23:41 +0300 Subject: [PATCH 09/32] ggml : remove WASM SIMD bit shuffling + remove vzip for ARM 32-bit --- ggml.c | 120 +++++---------------------------------------------------- 1 file changed, 9 insertions(+), 111 deletions(-) diff --git a/ggml.c b/ggml.c index 1e23d3ec83977..ed6e4742bc8b8 100644 --- a/ggml.c +++ b/ggml.c @@ -689,94 +689,6 @@ float vmaxvq_f32(float32x4_t v) { MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); } -int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) { - int8x8_t res; - - res[0] = a[0]; res[1] = b[0]; - res[2] = a[1]; res[3] = b[1]; - res[4] = a[2]; res[5] = b[2]; - res[6] = a[3]; res[7] = b[3]; - - return res; -} - -int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) { - int8x8_t res; - - res[0] = a[4]; res[1] = b[4]; - res[2] = a[5]; res[3] = b[5]; - res[4] = a[6]; res[5] = b[6]; - res[6] = a[7]; res[7] = b[7]; - - return res; -} - -uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { - uint8x8_t res; - - res[0] = a[0]; res[1] = b[0]; - res[2] = a[1]; res[3] = b[1]; - res[4] = a[2]; res[5] = b[2]; - res[6] = a[3]; res[7] = b[3]; - - return res; -} - -uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { - uint8x8_t res; - - res[0] = a[4]; res[1] = b[4]; - res[2] = a[5]; res[3] = b[5]; - res[4] = a[6]; res[5] = b[6]; - res[6] = a[7]; res[7] = b[7]; - - return res; -} - -int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) { - int8x16_t res; - - res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1]; - res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3]; - res[8] = a[4]; res[9] = b[4]; res[10] = a[5]; res[11] = b[5]; - res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7]; - - return res; -} - -int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) { - int8x16_t res; - - res[0] = a[8]; res[1] = b[8]; res[2] = a[9]; res[3] = b[9]; - res[4] = a[10]; res[5] = b[10]; res[6] = a[11]; res[7] = b[11]; - res[8] = a[12]; res[9] = b[12]; res[10] = a[13]; res[11] = b[13]; - res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15]; - - return res; -} - -uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) { - uint8x16_t res; - - res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1]; - res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3]; - res[8] = a[4]; res[9] = b[4]; res[10] = a[5]; res[11] = b[5]; - res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7]; - - return res; -} - -uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) { - uint8x16_t res; - - res[0] = a[8]; res[1] = b[8]; res[2] = a[9]; res[3] = b[9]; - res[4] = a[10]; res[5] = b[10]; res[6] = a[11]; res[7] = b[11]; - res[8] = a[12]; res[9] = b[12]; res[10] = a[13]; res[11] = b[13]; - res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15]; - - return res; -} - int32x4_t vcvtnq_s32_f32(float32x4_t v) { int32x4_t res; @@ -2753,13 +2665,9 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const v128_t v0l = wasm_v128_and (v0, m4b); const v128_t v0h = wasm_u8x16_shr(v0, 4); - // interleave - const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); - const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); - // add high bit and sub 16 - const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b); - const v128_t v0hf = 
wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b); + const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0l, qhl), s16b); + const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0h, qhh), s16b); // load y const v128_t v1l = wasm_v128_load(y0->qs); @@ -2944,13 +2852,9 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * static bool x = true; - // interleave - const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); - const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); - // add high bit - const v128_t v0lf = wasm_v128_or(v0lz, qhl); - const v128_t v0hf = wasm_v128_or(v0hz, qhh); + const v128_t v0lf = wasm_v128_or(v0l, qhl); + const v128_t v0hf = wasm_v128_or(v0h, qhh); // load y const v128_t v1l = wasm_v128_load(y0->qs); @@ -3033,11 +2937,11 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * } static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK8_0; + const int qk = QK8_0; + const int nb = n / qk; - assert(n % QK8_0 == 0); + assert(n % qk == 0); assert(nb % 2 == 0); - assert(QK8_0 == QK8_0); const block_q8_0 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -3117,16 +3021,10 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * float sumf = 0.0; for (int i = 0; i < nb; i++) { - const int8_t * restrict x0 = x[i].qs; - const int8_t * restrict y0 = y[i].qs; - int sumi = 0; - for (int j = 0; j < QK8_0; j++) { - const int v0 = x0[j]; - const int v1 = y0[j]; - - sumi += v0*v1; + for (int j = 0; j < qk; j++) { + sumi += x[i].qs[j]*y[i].qs[j]; } sumf += (x[i].d*y[i].d)*sumi; From 9472d0ea8bcee2a78178bd0070c20f8824c7378d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 May 2023 18:07:11 +0300 Subject: [PATCH 10/32] ggml : fix Q4_1 quantization --- ggml.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index ed6e4742bc8b8..5d00404890822 100644 --- a/ggml.c +++ b/ggml.c @@ -826,8 +826,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r y[i].m = min; for (int l = 0; l < qk/2; ++l) { - const float x0 = (x[0 + l] - min)*id; - const float x1 = (x[qk/2 + l] - min)*id; + const float x0 = (x[i*qk + 0 + l] - min)*id; + const float x1 = (x[i*qk + qk/2 + l] - min)*id; const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); From cdc960732900163fab4ce496fca5c6d2c687fe5c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 May 2023 18:23:59 +0300 Subject: [PATCH 11/32] ggml : update cuBLAS + normalize variable names --- ggml-cuda.cu | 84 +++++++------------ ggml.c | 233 ++++++++++++++++++++++++++------------------------- 2 files changed, 149 insertions(+), 168 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 127b352a0f2c9..b1a9ffb3258a9 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -81,29 +81,26 @@ typedef struct { static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding"); static __global__ void dequantize_block_q4_0(const void * vx, float * y) { + static const int qk = QK4_0; + const block_q4_0 * x = (const block_q4_0 *) vx; const int i = blockIdx.x; const float d = x[i].d; - const uint8_t * pp = x[i].qs; - - for (int l = 0; l < QK4_0; l += 2) { - const uint8_t vi = pp[l/2]; - - const int8_t vi0 = vi & 0xf; - const int8_t vi1 = vi >> 4; + for (int j 
= 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0xf) - 8; + const int x1 = (x[i].qs[j] >> 4) - 8; - const float v0 = (vi0 - 8)*d; - const float v1 = (vi1 - 8)*d; - - y[i*QK4_0 + l + 0] = v0; - y[i*QK4_0 + l + 1] = v1; + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; } } static __global__ void dequantize_block_q4_1(const void * vx, float * y) { + static const int qk = QK4_1; + const block_q4_1 * x = (const block_q4_1 *) vx; const int i = blockIdx.x; @@ -111,19 +108,12 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) { const float d = x[i].d; const float m = x[i].m; - const uint8_t * pp = x[i].qs; - - for (int l = 0; l < QK4_1; l += 2) { - const uint8_t vi = pp[l/2]; - - const int8_t vi0 = vi & 0xf; - const int8_t vi1 = vi >> 4; + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0xf); + const int x1 = (x[i].qs[j] >> 4); - const float v0 = vi0*d + m; - const float v1 = vi1*d + m; - - y[i*QK4_1 + l + 0] = v0; - y[i*QK4_1 + l + 1] = v1; + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; } } @@ -151,35 +141,32 @@ static __global__ void dequantize_block_q4_2(const void * vx, float * y) { } static __global__ void dequantize_block_q5_0(const void * vx, float * y) { + static const int qk = QK5_0; + const block_q5_0 * x = (const block_q5_0 *) vx; const int i = blockIdx.x; const float d = x[i].d; - const uint8_t * pp = x[i].qs; - uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); - for (int l = 0; l < QK5_0; l += 2) { - const uint8_t vi = pp[l/2]; - - const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4; - const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4; + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - const int8_t vi0 = ((vi & 0xf) | vh0); - const int8_t vi1 = ((vi >> 4) | vh1); + const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - const float v0 = (vi0 - 16)*d; - const float v1 = (vi1 - 16)*d; - - y[i*QK5_0 + l + 0] = v0; - y[i*QK5_0 + l + 1] = v1; + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; } } static __global__ void dequantize_block_q5_1(const void * vx, float * y) { + static const int qk = QK5_1; + const block_q5_1 * x = (const block_q5_1 *) vx; const int i = blockIdx.x; @@ -187,25 +174,18 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) { const float d = x[i].d; const float m = x[i].m; - const uint8_t * pp = x[i].qs; - uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); - for (int l = 0; l < QK5_1; l += 2) { - const uint8_t vi = pp[l/2]; - - const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4; - const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4; - - const int8_t vi0 = (vi & 0xf) | vh0; - const int8_t vi1 = (vi >> 4) | vh1; + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - const float v0 = vi0*d + m; - const float v1 = vi1*d + m; + const int x0 = (x[i].qs[j] & 0xf) | xh_0; + const int x1 = (x[i].qs[j] >> 4) | xh_1; - y[i*QK5_1 + l + 0] = v0; - y[i*QK5_1 + l + 1] = v1; + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; } } diff --git a/ggml.c b/ggml.c index 5d00404890822..817782179a9d3 100644 --- a/ggml.c +++ b/ggml.c @@ -771,8 +771,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r float amax = 0.0f; // absolute max float max = 0.0f; - 
for (int l = 0; l < qk; l++) { - const float v = x[i*qk + l]; + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; if (amax < fabsf(v)) { amax = fabsf(v); max = v; @@ -784,15 +784,15 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r y[i].d = d; - for (int l = 0; l < qk/2; ++l) { - const float x0 = x[i*qk + 0 + l]*id; - const float x1 = x[i*qk + qk/2 + l]*id; + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); - y[i].qs[l] = xi0; - y[i].qs[l] |= xi1 << 4; + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; } } } @@ -812,8 +812,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r float min = FLT_MAX; float max = -FLT_MAX; - for (int l = 0; l < qk; l++) { - const float v = x[i*qk + l]; + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; if (v < min) min = v; if (v > max) max = v; @@ -825,15 +825,15 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r y[i].d = d; y[i].m = min; - for (int l = 0; l < qk/2; ++l) { - const float x0 = (x[i*qk + 0 + l] - min)*id; - const float x1 = (x[i*qk + qk/2 + l] - min)*id; + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); - y[i].qs[l] = xi0; - y[i].qs[l] |= xi1 << 4; + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; } } } @@ -854,8 +854,8 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r float amax = 0.0f; // absolute max float max = 0.0f; - for (int l = 0; l < qk; l++) { - const float v = x[i*qk + l]; + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; if (amax < fabsf(v)) { amax = fabsf(v); max = v; @@ -869,15 +869,15 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r uint64_t qs[QK4_2 / 16] = {0}; - for (int l = 0; l < qk/2; ++l) { - const float x0 = x[i*qk + 0 + l]*id; - const float x1 = x[i*qk + qk/2 + l]*id; + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; const uint64_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); const uint64_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); - qs[l/8] |= xi0 << (8*(l & 7)); - qs[l/8] |= xi1 << (8*(l & 7) + 4); + qs[j/8] |= xi0 << (8*(j & 7)); + qs[j/8] |= xi1 << (8*(j & 7) + 4); } memcpy(y[i].qs, qs, qk/2); @@ -899,8 +899,8 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r float amax = 0.0f; // absolute max float max = 0.0f; - for (int l = 0; l < qk; l++) { - const float v = x[i*qk + l]; + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; if (amax < fabsf(v)) { amax = fabsf(v); max = v; @@ -914,18 +914,18 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r uint32_t qh = 0; - for (int l = 0; l < qk/2; ++l) { - const float x0 = x[i*qk + 0 + l]*id; - const float x1 = x[i*qk + qk/2 + l]*id; + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); - y[i].qs[l] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); // get the 5-th bit and store it in qh at the right position - qh |= 
((xi0 & 0x10) >> 4) << (l + 0); - qh |= ((xi1 & 0x10) >> 4) << (l + qk/2); + qh |= ((xi0 & 0x10) >> 4) << (j + 0); + qh |= ((xi1 & 0x10) >> 4) << (j + qk/2); } memcpy(&y[i].qh, &qh, sizeof(qh)); @@ -947,8 +947,8 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r float min = FLT_MAX; float max = -FLT_MAX; - for (int l = 0; l < qk; l++) { - const float v = x[i*qk + l]; + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; if (v < min) min = v; if (v > max) max = v; @@ -962,18 +962,18 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r uint32_t qh = 0; - for (int l = 0; l < qk/2; ++l) { - const float x0 = (x[i*qk + 0 + l] - min)*id; - const float x1 = (x[i*qk + qk/2 + l] - min)*id; + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; const uint8_t xi0 = (uint8_t)(x0 + 0.5f); const uint8_t xi1 = (uint8_t)(x1 + 0.5f); - y[i].qs[l] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); // get the 5-th bit and store it in qh at the right position - qh |= ((xi0 & 0x10) >> 4) << (l + 0); - qh |= ((xi1 & 0x10) >> 4) << (l + qk/2); + qh |= ((xi0 & 0x10) >> 4) << (j + 0); + qh |= ((xi1 & 0x10) >> 4) << (j + qk/2); } memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); @@ -992,8 +992,8 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max - for (int l = 0; l < QK8_0; l++) { - const float v = x[i*QK8_0 + l]; + for (int j = 0; j < QK8_0; j++) { + const float v = x[i*QK8_0 + j]; amax = MAX(amax, fabsf(v)); } @@ -1002,10 +1002,10 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r y[i].d = d; - for (int l = 0; l < QK8_0; ++l) { - const float v0 = x[i*QK8_0 + l]*id; + for (int j = 0; j < QK8_0; ++j) { + const float v0 = x[i*QK8_0 + j]*id; - y[i].qs[l] = roundf(v0); + y[i].qs[j] = roundf(v0); } } } @@ -1146,8 +1146,8 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max - for (int l = 0; l < QK8_1; l++) { - const float v = x[i*QK8_1 + l]; + for (int j = 0; j < QK8_1; j++) { + const float v = x[i*QK8_1 + j]; amax = MAX(amax, fabsf(v)); } @@ -1159,15 +1159,15 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r int sum0 = 0; int sum1 = 0; - for (int l = 0; l < QK8_1/2; ++l) { - const float v0 = x[i*QK8_1 + l]*id; - const float v1 = x[i*QK8_1 + QK8_1/2 + l]*id; + for (int j = 0; j < QK8_1/2; ++j) { + const float v0 = x[i*QK8_1 + j]*id; + const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id; - y[i].qs[ l] = roundf(v0); - y[i].qs[QK8_1/2 + l] = roundf(v1); + y[i].qs[ j] = roundf(v0); + y[i].qs[QK8_1/2 + j] = roundf(v1); - sum0 += y[i].qs[ l]; - sum1 += y[i].qs[QK8_1/2 + l]; + sum0 += y[i].qs[ j]; + sum1 += y[i].qs[QK8_1/2 + j]; } y[i].s0 = d * sum0; @@ -1187,12 +1187,12 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int float32x4_t asrcv[8]; float32x4_t amaxv[8]; - for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l); - for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]); + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); - for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]); - for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], 
amaxv[4*l+2]); - for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]); + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); const float amax = vmaxvq_f32(amaxv[0]); @@ -1205,27 +1205,27 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int int32x4_t accv1 = vdupq_n_s32(0); // low half - for (int l = 0; l < 4; l++) { - const float32x4_t v = vmulq_n_f32(srcv[l], id); + for (int j = 0; j < 4; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); const int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3); + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); accv0 = vaddq_s32(accv0, vi); } // high half - for (int l = 4; l < 8; l++) { - const float32x4_t v = vmulq_n_f32(srcv[l], id); + for (int j = 4; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); const int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3); + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); accv1 = vaddq_s32(accv1, vi); } @@ -1393,14 +1393,14 @@ static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict const uint8_t * qsp = b4_from_nibbles_64(qk, x[i].qs, qs); - for (int l = 0; l < qk; ++l) { - y[i*qk + l] = (qsp[l] - 8)*d; + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = (qsp[j] - 8)*d; } } } static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { - static const int qk = QK4_0; + static const int qk = QK5_0; assert(k % qk == 0); @@ -1453,18 +1453,19 @@ static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict } static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, int k) { - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; + static const int qk = QK8_0; + + assert(k % qk == 0); + + const int nb = k / qk; const block_q8_0 * restrict x = vx; for (int i = 0; i < nb; i++) { const float d = x[i].d; - const int8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK8_0; ++l) { - y[i*QK8_0 + l] = pp[l]*d; + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = x[i].qs[j]*d; } } } @@ -12314,15 +12315,15 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * assert(k % QK4_0 == 0); const int nb = k / QK4_0; - for (int j = 0; j < n; j += k) { - block_q4_0 * restrict y = (block_q4_0 *) dst + j/QK4_0; + for (int b = 0; b < n; b += k) { + block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0; - quantize_row_q4_0_reference(src + j, y, k); + quantize_row_q4_0_reference(src + b, y, k); for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK4_0; l += 2) { - const uint8_t vi0 = y[i].qs[l/2] & 0x0F; - const uint8_t vi1 = y[i].qs[l/2] >> 4; + for (int j = 0; j < QK4_0; j += 2) { + const uint8_t vi0 = y[i].qs[j/2] & 0x0F; + const uint8_t vi1 = y[i].qs[j/2] >> 4; hist[vi0]++; hist[vi1]++; @@ -12337,15 +12338,15 @@ 
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * assert(k % QK4_1 == 0); const int nb = k / QK4_1; - for (int j = 0; j < n; j += k) { - block_q4_1 * restrict y = (block_q4_1 *) dst + j/QK4_1; + for (int b = 0; b < n; b += k) { + block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1; - quantize_row_q4_1_reference(src + j, y, k); + quantize_row_q4_1_reference(src + b, y, k); for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK4_1; l += 2) { - const uint8_t vi0 = y[i].qs[l/2] & 0x0F; - const uint8_t vi1 = y[i].qs[l/2] >> 4; + for (int j = 0; j < QK4_1; j += 2) { + const uint8_t vi0 = y[i].qs[j/2] & 0x0F; + const uint8_t vi1 = y[i].qs[j/2] >> 4; hist[vi0]++; hist[vi1]++; @@ -12360,15 +12361,15 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * assert(k % QK4_2 == 0); const int nb = k / QK4_2; - for (int j = 0; j < n; j += k) { - block_q4_2 * restrict y = (block_q4_2 *)dst + j/QK4_2; + for (int b = 0; b < n; b += k) { + block_q4_2 * restrict y = (block_q4_2 *)dst + b/QK4_2; - quantize_row_q4_2_reference(src + j, y, k); + quantize_row_q4_2_reference(src + b, y, k); for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK4_2; l += 2) { - const uint8_t vi0 = y[i].qs[l/2] & 0x0F; - const uint8_t vi1 = y[i].qs[l/2] >> 4; + for (int j = 0; j < QK4_2; j += 2) { + const uint8_t vi0 = y[i].qs[j/2] & 0x0F; + const uint8_t vi1 = y[i].qs[j/2] >> 4; hist[vi0]++; hist[vi1]++; @@ -12383,22 +12384,22 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * assert(k % QK5_0 == 0); const int nb = k / QK5_0; - for (int j = 0; j < n; j += k) { - block_q5_0 * restrict y = (block_q5_0 *)dst + j/QK5_0; + for (int b = 0; b < n; b += k) { + block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0; - quantize_row_q5_0_reference(src + j, y, k); + quantize_row_q5_0_reference(src + b, y, k); for (int i = 0; i < nb; i++) { uint32_t qh; memcpy(&qh, &y[i].qh, sizeof(qh)); - for (int l = 0; l < QK5_0; l += 2) { - const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; + for (int j = 0; j < QK5_0; j += 2) { + const uint8_t vh0 = ((qh & (1u << (j + 0))) >> (j + 0)) << 4; + const uint8_t vh1 = ((qh & (1u << (j + 1))) >> (j + 1)) << 4; // cast to 16 bins - const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2; - const uint8_t vi1 = ((y[i].qs[l/2] >> 4) | vh1) / 2; + const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; + const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2; hist[vi0]++; hist[vi1]++; @@ -12413,22 +12414,22 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * assert(k % QK5_1 == 0); const int nb = k / QK5_1; - for (int j = 0; j < n; j += k) { - block_q5_1 * restrict y = (block_q5_1 *)dst + j/QK5_1; + for (int b = 0; b < n; b += k) { + block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1; - quantize_row_q5_1_reference(src + j, y, k); + quantize_row_q5_1_reference(src + b, y, k); for (int i = 0; i < nb; i++) { uint32_t qh; memcpy(&qh, &y[i].qh, sizeof(qh)); - for (int l = 0; l < QK5_1; l += 2) { - const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; + for (int j = 0; j < QK5_1; j += 2) { + const uint8_t vh0 = ((qh & (1u << (j + 0))) >> (j + 0)) << 4; + const uint8_t vh1 = ((qh & (1u << (j + 1))) >> (j + 1)) << 4; // cast to 16 bins - const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2; - const uint8_t vi1 = ((y[i].qs[l/2] >> 4) | vh1) / 2; + const 
uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; + const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2; hist[vi0]++; hist[vi1]++; @@ -12443,14 +12444,14 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * assert(k % QK8_0 == 0); const int nb = k / QK8_0; - for (int j = 0; j < n; j += k) { - block_q8_0 * restrict y = (block_q8_0 *)dst + j/QK8_0; + for (int b = 0; b < n; b += k) { + block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0; - quantize_row_q8_0_reference(src + j, y, k); + quantize_row_q8_0_reference(src + b, y, k); for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK8_0; ++l) { - const int8_t vi = y[i].qs[l]; + for (int j = 0; j < QK8_0; ++j) { + const int8_t vi = y[i].qs[j]; hist[vi/16 + 8]++; } From 4bf1c8a43e26ac706e1ca8cf78fa12e7203cda89 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 May 2023 18:26:59 +0300 Subject: [PATCH 12/32] ggml : remove Q4_2 mode --- examples/quantize/quantize.cpp | 11 +- ggml-cuda.cu | 37 ----- ggml-opencl.c | 30 +--- ggml.c | 286 --------------------------------- ggml.h | 4 +- llama.cpp | 4 - llama.h | 2 +- 7 files changed, 8 insertions(+), 366 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 7c77018daa344..115d8fb1ba36b 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -7,12 +7,11 @@ #include static const std::map LLAMA_FTYPE_MAP = { - {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0}, - {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1}, - {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2}, - {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0}, - {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1}, - {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0}, + {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0}, + {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1}, + {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0}, + {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1}, + {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0}, }; bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) { diff --git a/ggml-cuda.cu b/ggml-cuda.cu index b1a9ffb3258a9..46f7b568c608b 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -49,13 +49,6 @@ typedef struct { } block_q4_1; static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); -#define QK4_2 16 -typedef struct { - half d; // delta - uint8_t qs[QK4_2 / 2]; // nibbles / quants -} block_q4_2; -static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); - #define QK5_0 32 typedef struct { half d; // delta @@ -117,29 +110,6 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) { } } -static __global__ void dequantize_block_q4_2(const void * vx, float * y) { - const block_q4_2 * x = (const block_q4_2 *) vx; - - const int i = blockIdx.x; - - const float d = x[i].d; - - const uint8_t * pp = x[i].qs; - - for (int l = 0; l < QK4_2; l += 2) { - const uint8_t vi = pp[l/2]; - - const int8_t vi0 = vi & 0xf; - const int8_t vi1 = vi >> 4; - - const float v0 = (vi0 - 8)*d; - const float v1 = (vi1 - 8)*d; - - y[i*QK4_2 + l + 0] = v0; - y[i*QK4_2 + l + 1] = v1; - } -} - static __global__ void dequantize_block_q5_0(const void * vx, float * y) { static const int qk = QK5_0; @@ -215,11 +185,6 @@ static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStre dequantize_block_q4_1<<>>(vx, y); } -static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) { - const int nb = k / QK4_2; - dequantize_block_q4_2<<>>(vx, y); -} - static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, 
cudaStream_t stream) { const int nb = k / QK5_0; dequantize_block_q5_0<<>>(vx, y); @@ -254,8 +219,6 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_q4_0_cuda; case GGML_TYPE_Q4_1: return dequantize_row_q4_1_cuda; - case GGML_TYPE_Q4_2: - return dequantize_row_q4_2_cuda; case GGML_TYPE_Q5_0: return dequantize_row_q5_0_cuda; case GGML_TYPE_Q5_1: diff --git a/ggml-opencl.c b/ggml-opencl.c index 4389eca393466..0e6e6770f6307 100644 --- a/ggml-opencl.c +++ b/ggml-opencl.c @@ -52,26 +52,6 @@ __kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global f result[index + 1] = (vi >> 4) * d + m; } -struct block_q4_2 -{ - ushort d; - uchar qs[8]; -}; - -__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) { - const uint i = get_global_id(0) / 16; - const uint l = get_local_id(0); - - const float d = vload_half(0, (__global half*) &blocks[i].d); - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*16 + l*2; - result[index + 0] = ((vi & 0xf) - 8)*d; - result[index + 1] = ((vi >> 4) - 8)*d; -} - - struct block_q5_0 { float d; @@ -167,7 +147,7 @@ static cl_device_id device; static cl_context context; static cl_command_queue queue; static cl_program program; -static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0; +static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0; static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c; static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0; @@ -238,8 +218,6 @@ void ggml_cl_init(void) { CL_CHECK(err, "clCreateKernel"); kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err); CL_CHECK(err, "clCreateKernel"); - kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err); - CL_CHECK(err, "clCreateKernel"); kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err); CL_CHECK(err, "clCreateKernel"); kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err); @@ -292,12 +270,6 @@ void ggml_cl_sgemm_wrapper( local = 16; size_qb = global * (sizeof(float) * 2 + local) / 32; break; - case GGML_TYPE_Q4_2: - dequant = true; - kernel = kernel_q4_2; - local = 8; - size_qb = global * (sizeof(ggml_fp16_t) + local) / 16; - break; case GGML_TYPE_Q5_0: dequant = true; kernel = kernel_q5_0; diff --git a/ggml.c b/ggml.c index 817782179a9d3..ea73bf8441e6f 100644 --- a/ggml.c +++ b/ggml.c @@ -615,18 +615,6 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) #if __ARM_NEON -// TODO: obsolete - will be removed -static inline const uint8_t * b4_from_nibbles_64(const int qk, const uint8_t * qs, uint64_t * qd) { - memcpy(qd, qs, qk/2); - - for (int l = 0; l < qk/16; ++l) { - qd[l + qk/16] = (qd[l] & 0xF0F0F0F0F0F0F0F0ULL) >> 4; - qd[l + 0 ] = (qd[l] & 0x0F0F0F0F0F0F0F0FULL) >> 0; - } - - return (const uint8_t *) qd; -} - #if !defined(__aarch64__) inline static uint16_t vaddvq_u8(uint8x16_t v) { @@ -719,13 +707,6 @@ typedef struct { } block_q4_1; static_assert(sizeof(block_q4_1) == 2 * sizeof(float) + QK4_1 / 2, "wrong q4_1 block size/padding"); -#define QK4_2 16 -typedef struct { - ggml_fp16_t d; // delta - uint8_t qs[QK4_2 / 2]; // nibbles / quants -} block_q4_2; -static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); - #define QK5_0 32 typedef struct { ggml_fp16_t d; // delta @@ -842,52 +823,6 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k
quantize_row_q4_1_reference(x, y, k); } -// reference implementation for deterministic creation of model files -static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * restrict y, int k) { - static const int qk = QK4_2; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < qk; j++) { - const float v = x[i*qk + j]; - if (amax < fabsf(v)) { - amax = fabsf(v); - max = v; - } - } - - const float d = max / -8; - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - uint64_t qs[QK4_2 / 16] = {0}; - - for (int j = 0; j < qk/2; ++j) { - const float x0 = x[i*qk + 0 + j]*id; - const float x1 = x[i*qk + qk/2 + j]*id; - - const uint64_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); - const uint64_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); - - qs[j/8] |= xi0 << (8*(j & 7)); - qs[j/8] |= xi1 << (8*(j & 7) + 4); - } - - memcpy(y[i].qs, qs, qk/2); - } -} - -static void quantize_row_q4_2(const float * restrict x, void * restrict y, int k) { - quantize_row_q4_2_reference(x, y, k); -} - static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { static const int qk = QK5_0; @@ -1378,27 +1313,6 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict } } -static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict y, int k) { - // BROKEN !!! - static const int qk = QK4_2; - - assert(k % qk == 0); - - const int nb = k / qk; - - uint64_t qs[QK4_2 / 8]; - - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * qsp = b4_from_nibbles_64(qk, x[i].qs, qs); - - for (int j = 0; j < qk; ++j) { - y[i*qk + j] = (qsp[j] - 8)*d; - } - } -} - static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { static const int qk = QK5_0; @@ -1472,7 +1386,6 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); @@ -1494,14 +1407,6 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_q = ggml_vec_dot_q4_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, }, - [GGML_TYPE_Q4_2] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_2, - .quantize_row_q = quantize_row_q4_2, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q4_2_q8_0, - .vec_dot_type = GGML_TYPE_Q8_0, - }, [GGML_TYPE_Q5_0] = { .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_0, .quantize_row_q = quantize_row_q5_0, @@ -2414,159 +2319,6 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * #endif } -static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
restrict vx, const void * restrict vy) { - const int qk = QK8_0; - const int nb = n / qk; - - assert(n % qk == 0); - assert(nb % 2 == 0); - - assert(qk == 2*QK4_2); - - const block_q4_2 * restrict x = vx; - const block_q8_0 * restrict y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - for (int i = 0; i < nb; i += 2) { - const block_q4_2 * restrict x0_0 = &x[2*(i + 0) + 0]; - const block_q4_2 * restrict x0_1 = &x[2*(i + 0) + 1]; - const block_q4_2 * restrict x1_0 = &x[2*(i + 1) + 0]; - const block_q4_2 * restrict x1_1 = &x[2*(i + 1) + 1]; - - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0x0F); - const int8x16_t s8b = vdupq_n_s8(0x8); - - const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs)); - const uint8x16_t v0_1 = vcombine_u8(vld1_u8(x1_0->qs), vld1_u8(x1_1->qs)); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // sub 8 - const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); - const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); - const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); - const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - sumv0 = vmlaq_n_f32(sumv0, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l)), GGML_FP16_TO_FP32(x0_0->d)), - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hs, v1_0h)), GGML_FP16_TO_FP32(x0_1->d))), y0->d); - - sumv1 = vmlaq_n_f32(sumv1, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l)), GGML_FP16_TO_FP32(x1_0->d)), - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1hs, v1_1h)), GGML_FP16_TO_FP32(x1_1->d))), y1->d); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(pl0), GGML_FP16_TO_FP32(x0_0->d)), - vmulq_n_f32(vcvtq_f32_s32(ph0), GGML_FP16_TO_FP32(x0_1->d))), y0->d); - - sumv1 = vmlaq_n_f32(sumv1, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(pl1), GGML_FP16_TO_FP32(x1_0->d)), - vmulq_n_f32(vcvtq_f32_s32(ph1), GGML_FP16_TO_FP32(x1_1->d))), y1->d); -#endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif 
defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; i++) { - /* Compute combined scale for the block */ - const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d)); - const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d)); - const __m256 d = _mm256_mul_ps(_mm256_set_m128(d1, d0), _mm256_broadcast_ss(&y[i].d)); - - __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs); - __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs); - __m256i bx = _mm256_set_m128i(bx1, bx0); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. - const __m256i off = _mm256_set1_epi8(8); - bx = _mm256_sub_epi8(bx, off); - - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps(d, q, acc); - } - - *s = hsum_float_8(acc); -#else - // scalar - float sumf = 0.0; - for (int i = 0; i < nb; i++) { - const uint8_t * restrict x0 = x[2*i + 0].qs; - const uint8_t * restrict x1 = x[2*i + 1].qs; - const int8_t * restrict y0 = y[i].qs; - - const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d); - const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d); - - int sumi_0 = 0; - int sumi_1 = 0; - - for (int j = 0; j < QK8_0/4; j++) { - const uint8_t v0 = x0[j]; - const uint8_t v1 = x1[j]; - - const int i0_0 = (int8_t) (v0 & 0x0F) - 8; - const int i1_0 = (int8_t) (v0 >> 4) - 8; - - const int i0_1 = (int8_t) (v1 & 0x0F) - 8; - const int i1_1 = (int8_t) (v1 >> 4) - 8; - - const int i2_0 = y0[2*j + 0]; - const int i3_0 = y0[2*j + 1]; - - const int i2_1 = y0[2*(j + QK8_0/4) + 0]; - const int i3_1 = y0[2*(j + QK8_0/4) + 1]; - - sumi_0 += i0_0*i2_0 + i1_0*i3_0; - sumi_1 += i0_1*i2_1 + i1_1*i3_1; - } - - sumf += (d0 * y[i].d) * sumi_0; - sumf += (d1 * y[i].d) * sumi_1; - } - *s = sumf; -#endif -} - static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int qk = QK8_0; const int nb = n / qk; @@ -3289,7 +3041,6 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_F16] = 1, [GGML_TYPE_Q4_0] = QK4_0, [GGML_TYPE_Q4_1] = QK4_1, - [GGML_TYPE_Q4_2] = QK4_2, [GGML_TYPE_Q5_0] = QK5_0, [GGML_TYPE_Q5_1] = QK5_1, [GGML_TYPE_Q8_0] = QK8_0, @@ -3305,7 +3056,6 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_F16] = sizeof(ggml_fp16_t), [GGML_TYPE_Q4_0] = sizeof(block_q4_0), [GGML_TYPE_Q4_1] = sizeof(block_q4_1), - [GGML_TYPE_Q4_2] = sizeof(block_q4_2), [GGML_TYPE_Q5_0] = sizeof(block_q5_0), [GGML_TYPE_Q5_1] = sizeof(block_q5_1), [GGML_TYPE_Q8_0] = sizeof(block_q8_0), @@ -3322,7 +3072,6 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { [GGML_TYPE_F16] = "f16", [GGML_TYPE_Q4_0] = "q4_0", [GGML_TYPE_Q4_1] = "q4_1", - [GGML_TYPE_Q4_2] = "q4_2", [GGML_TYPE_Q5_0] = "q5_0", [GGML_TYPE_Q5_1] = "q5_1", [GGML_TYPE_Q8_0] = "q8_0", @@ -3338,7 +3087,6 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { [GGML_TYPE_F16] = false, [GGML_TYPE_Q4_0] = true, [GGML_TYPE_Q4_1] = true, - [GGML_TYPE_Q4_2] = true, [GGML_TYPE_Q5_0] = true, [GGML_TYPE_Q5_1] = true, [GGML_TYPE_Q8_0] = true, @@ -3623,7 +3371,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break; case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break; case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break; - case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break; case 
GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break; case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break; case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break; @@ -6624,7 +6371,6 @@ static void ggml_compute_forward_add( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -8179,7 +7925,6 @@ static void ggml_compute_forward_mul_mat( switch (src0->type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -8410,7 +8155,6 @@ static void ggml_compute_forward_get_rows( switch (src0->type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -8735,7 +8479,6 @@ static void ggml_compute_forward_alibi( } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -12357,29 +12100,6 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * return (n/QK4_1*sizeof(block_q4_1)); } -size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK4_2 == 0); - const int nb = k / QK4_2; - - for (int b = 0; b < n; b += k) { - block_q4_2 * restrict y = (block_q4_2 *)dst + b/QK4_2; - - quantize_row_q4_2_reference(src + b, y, k); - - for (int i = 0; i < nb; i++) { - for (int j = 0; j < QK4_2; j += 2) { - const uint8_t vi0 = y[i].qs[j/2] & 0x0F; - const uint8_t vi1 = y[i].qs[j/2] >> 4; - - hist[vi0]++; - hist[vi1]++; - } - } - } - - return (n/QK4_2*sizeof(block_q4_2)); -} - size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK5_0 == 0); const int nb = k / QK5_0; @@ -12476,12 +12196,6 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i block_q4_1 * block = (block_q4_1*)dst + start / QK4_1; result = ggml_quantize_q4_1(src + start, block, n, n, hist); } break; - case GGML_TYPE_Q4_2: - { - GGML_ASSERT(start % QK4_2 == 0); - block_q4_2 * block = (block_q4_2*)dst + start / QK4_2; - result = ggml_quantize_q4_2(src + start, block, n, n, hist); - } break; case GGML_TYPE_Q5_0: { GGML_ASSERT(start % QK5_0 == 0); diff --git a/ggml.h b/ggml.h index 508dd69b41713..bb9a025e257d5 100644 --- a/ggml.h +++ b/ggml.h @@ -231,7 +231,7 @@ extern "C" { GGML_TYPE_F16 = 1, GGML_TYPE_Q4_0 = 2, GGML_TYPE_Q4_1 = 3, - GGML_TYPE_Q4_2 = 4, + // GGML_TYPE_Q4_2 = 4, support has been removed // GGML_TYPE_Q4_3 (5) support has been removed GGML_TYPE_Q5_0 = 6, GGML_TYPE_Q5_1 = 7, @@ -251,7 +251,6 @@ extern "C" { GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors @@ -876,7 +875,6 @@ extern "C" { GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, 
int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); diff --git a/llama.cpp b/llama.cpp index 4bba93a111ae4..5c6c3e72211fc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -482,7 +482,6 @@ struct llama_file_loader { case GGML_TYPE_F16: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -558,7 +557,6 @@ struct llama_file_saver { case GGML_TYPE_F16: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: @@ -852,7 +850,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: return "mostly Q4_1, some F16"; - case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2"; case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0"; case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1"; case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; @@ -1905,7 +1902,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s switch (ftype) { case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; - case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break; case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break; case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break; case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break; diff --git a/llama.h b/llama.h index 58c6e0699a999..fea5ffeffbee0 100644 --- a/llama.h +++ b/llama.h @@ -78,7 +78,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors + // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors From b08c39b16c5f52ca656a92fe46a994868a87b082 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 May 2023 20:00:01 +0300 Subject: [PATCH 13/32] ggml : minor formatting --- ggml.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/ggml.c b/ggml.c index ea73bf8441e6f..7ab747ca3a801 100644 --- a/ggml.c +++ b/ggml.c @@ -1283,8 +1283,8 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict const float d = x[i].d; for (int j = 0; j < qk/2; ++j) { - const int x0 = (x[i].qs[j] & 0xf) - 8; - const int x1 = (x[i].qs[j] >> 4) - 8; + const int x0 = (x[i].qs[j] & 0x0F) - 8; + const int x1 = (x[i].qs[j] >> 4) - 8; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; @@ -1304,8 +1304,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict const float m = x[i].m; for (int j = 0; j < qk/2; ++j) { - const int x0 = (x[i].qs[j] & 0xf); - const int x1 = (x[i].qs[j] >> 4); + const int x0 = (x[i].qs[j] & 0x0F); + const int x1 = (x[i].qs[j] >> 4); y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; @@ -1330,8 +1330,8 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - const int32_t x0 = ((x[i].qs[j] & 0xf) | 
xh_0) - 16; - const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; y[i*qk + j + 0 ] = x0*d; y[i*qk + j + qk/2] = x1*d; @@ -1357,8 +1357,8 @@ static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - const int x0 = (x[i].qs[j] & 0xf) | xh_0; - const int x1 = (x[i].qs[j] >> 4) | xh_1; + const int x0 = (x[i].qs[j] & 0x0F) | xh_0; + const int x1 = (x[i].qs[j] >> 4) | xh_1; y[i*qk + j + 0 ] = x0*d + m; y[i*qk + j + qk/2] = x1*d + m; @@ -2184,8 +2184,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * int sumi = 0; for (int j = 0; j < qk/2; ++j) { - const int v0 = (x[i].qs[j] & 0xf) - 8; - const int v1 = (x[i].qs[j] >> 4) - 8; + const int v0 = (x[i].qs[j] & 0x0F) - 8; + const int v1 = (x[i].qs[j] >> 4) - 8; sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } @@ -2306,8 +2306,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * int sumi = 0; for (int j = 0; j < qk/2; ++j) { - const int v0 = (x[i].qs[j] & 0xf); - const int v1 = (x[i].qs[j] >> 4); + const int v0 = (x[i].qs[j] & 0x0F); + const int v1 = (x[i].qs[j] >> 4); sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } @@ -2487,8 +2487,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; - const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); } From 83674556b8fbba86ec5a482ffac0154d11163526 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 7 May 2023 20:26:02 +0300 Subject: [PATCH 14/32] ggml : fix Q5_0 quantization --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 7ab747ca3a801..8ef279fd74b6a 100644 --- a/ggml.c +++ b/ggml.c @@ -845,7 +845,7 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r const float d = max / -16; const float id = d ? 1.0f/d : 0.0f; - y[i].d = d; + y[i].d = GGML_FP32_TO_FP16(d); uint32_t qh = 0; From 928d2f335f0e1998b702df3b68a7d670c839a63f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 8 May 2023 22:06:54 +0300 Subject: [PATCH 15/32] scripts : add script for measuring the time per token --- .gitignore | 1 + README.md | 24 +++++------ llama.cpp | 4 +- scripts/perf-run-all.sh | 93 +++++++++++++++++++++++++++++++++++++++++ scripts/ppl-run-all.sh | 4 -- 5 files changed, 108 insertions(+), 18 deletions(-) create mode 100755 scripts/perf-run-all.sh diff --git a/.gitignore b/.gitignore index a5fef327718f0..f5023e3042a81 100644 --- a/.gitignore +++ b/.gitignore @@ -44,5 +44,6 @@ zig-cache/ ppl-*.txt qnt-*.txt +perf-*.txt examples/jeopardy/results.txt diff --git a/README.md b/README.md index 045f995347154..75bd9faf57e5d 100644 --- a/README.md +++ b/README.md @@ -338,18 +338,18 @@ As the models are currently fully loaded into memory, you will need adequate dis Several quantization methods are supported. They differ in the resulting model disk size and inference speed. 
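As a quick cross-check, the bits/weight row in the table below follows directly from the per-block layouts: each format stores one or two scale fields (plus a 32-bit high-bit mask for Q5_*) and the packed quants for a 32-weight block. A minimal standalone sketch — the block byte sizes are hardcoded here to mirror the struct definitions in ggml.c at this point in the series, not taken from the real headers:

#include <stdio.h>

int main(void) {
    // bytes per 32-weight block: scale field(s) + qh mask (q5_*) + packed quants
    const struct { const char * name; int bytes; } fmt[] = {
        { "q4_0", 4 + 16 },          // float d            + 32 x 4-bit
        { "q4_1", 4 + 4 + 16 },      // float d, float m   + 32 x 4-bit
        { "q5_0", 2 + 4 + 16 },      // fp16 d, 32-bit qh  + 32 x 4-bit
        { "q5_1", 2 + 2 + 4 + 16 },  // fp16 d, m, qh      + 32 x 4-bit
        { "q8_0", 4 + 32 },          // float d            + 32 x 8-bit
    };

    for (int i = 0; i < 5; ++i) {
        printf("%s: %.1f bits/weight\n", fmt[i].name, 8.0*fmt[i].bytes/32);
    }

    return 0;
}

Compiled and run, this prints 5.0, 6.0, 5.5, 6.0 and 9.0 bits/weight, matching the corresponding column entries in the updated table below.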
-| Model | Measure | F16 | Q4_0 | Q4_1 | Q4_2 | Q5_0 | Q5_1 | Q8_0 | -|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|-------:| -| 7B | perplexity | 5.9066 | 6.1620 | 6.0910 | 6.1466 | 5.9862 | 5.9481 | 5.9069 | -| 7B | file size | 13.0G | 4.0G | 4.8G | 4.0G | 4.4G | 4.8G | 7.1G | -| 7B | ms/tok @ 4th | 128 | 56 | 61 | 84 | 91 | 95 | 75 | -| 7B | ms/tok @ 8th | 128 | 47 | 55 | 48 | 53 | 59 | 75 | -| 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.0 | 5.5 | 6.0 | 9.0 | -| 13B | perplexity | 5.2543 | 5.3863 | 5.3607 | 5.3513 | 5.2856 | 5.2706 | 5.2548 | -| 13B | file size | 25.0G | 7.6G | 9.1G | 7.6G | 8.4G | 9.1G | 14G | -| 13B | ms/tok @ 4th | 239 | 104 | 113 | 160 | 176 | 185 | 141 | -| 13B | ms/tok @ 8th | 240 | 85 | 99 | 97 | 108 | 117 | 147 | -| 13B | bits/weight | 16.0 | 5.0 | 6.0 | 5.0 | 5.5 | 6.0 | 9.0 | +| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | +|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:| +| 7B | perplexity | 5.9066 | 6.1620 | 6.0910 | 5.9862 | 5.9481 | 5.9069 | +| 7B | file size | 13.0G | 4.0G | 4.8G | 4.4G | 4.8G | 7.1G | +| 7B | ms/tok @ 4th | 128 | 56 | 61 | 91 | 95 | 75 | +| 7B | ms/tok @ 8th | 128 | 47 | 55 | 53 | 59 | 75 | +| 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 | +| 13B | perplexity | 5.2543 | 5.3863 | 5.3607 | 5.2856 | 5.2706 | 5.2548 | +| 13B | file size | 25.0G | 7.6G | 9.1G | 8.4G | 9.1G | 14G | +| 13B | ms/tok @ 4th | 239 | 104 | 113 | 176 | 185 | 141 | +| 13B | ms/tok @ 8th | 240 | 85 | 99 | 108 | 117 | 147 | +| 13B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 | ### Perplexity (measuring model quality) diff --git a/llama.cpp b/llama.cpp index 5c6c3e72211fc..367522ac9051a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2809,9 +2809,9 @@ void llama_print_timings(struct llama_context * ctx) { fprintf(stderr, "\n"); fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0); - fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample); + fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample); fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval); - fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval); + fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval); fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0); } diff --git a/scripts/perf-run-all.sh b/scripts/perf-run-all.sh new file mode 100755 index 0000000000000..7dbfc7c2044e1 --- /dev/null +++ b/scripts/perf-run-all.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# +# Measure the performance (time per token) of the various quantization techniques +# + +QUANTIZE=0 +if [ "$1" != "" ]; then + echo "Quantizing" + QUANTIZE=1 +fi + +if [ "$QUANTIZE" != "0" ]; then + # + # quantize + # + + # 7B + time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-7b-q4_0.txt + time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-7b-q4_1.txt + time ./bin/quantize 
../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-7b-q5_0.txt + time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-7b-q5_1.txt + time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-7b-q8_0.txt + + # 13B + time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-13b-q4_0.txt + time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-13b-q4_1.txt + time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-13b-q5_0.txt + time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-13b-q5_1.txt + time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-13b-q8_0.txt +fi + +# +# perf +# run each command twice +# + +set -x + +# 7B - 4 threads + ./bin/main -m ../models/7B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-f16.txt | grep llama_print_timings + ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q4_0.txt | grep llama_print_timings + ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q4_1.txt | grep llama_print_timings + ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q5_0.txt | grep llama_print_timings + ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q5_1.txt | grep llama_print_timings + ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q8_0.txt | grep llama_print_timings + +# 7B - 8 threads + ./bin/main -m ../models/7B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-f16.txt | grep llama_print_timings + 
./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q4_0.txt | grep llama_print_timings + ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q4_1.txt | grep llama_print_timings + ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q5_0.txt | grep llama_print_timings + ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q5_1.txt | grep llama_print_timings + ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q8_0.txt | grep llama_print_timings + +# 13B - 4 threads + ./bin/main -m ../models/13B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-f16.txt | grep llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q4_0.txt | grep llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q4_1.txt | grep llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q5_0.txt | grep llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q5_1.txt | grep 
llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q8_0.txt | grep llama_print_timings + +# 13B - 8 threads + ./bin/main -m ../models/13B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-f16.txt | grep llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q4_0.txt | grep llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q4_1.txt | grep llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q5_0.txt | grep llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q5_1.txt | grep llama_print_timings + ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe" +time ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q8_0.txt | grep llama_print_timings diff --git a/scripts/ppl-run-all.sh b/scripts/ppl-run-all.sh index 28f31ca7130b7..c59e3075d26f2 100755 --- a/scripts/ppl-run-all.sh +++ b/scripts/ppl-run-all.sh @@ -7,7 +7,6 @@ # 7B time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-7b-q4_0.txt time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-7b-q4_1.txt -time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_2.bin q4_2 2>&1 | tee ../qnt-7b-q4_2.txt time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-7b-q5_0.txt time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-7b-q5_1.txt time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-7b-q8_0.txt @@ -15,7 +14,6 @@ time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0 # 13B time 
./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-13b-q4_0.txt time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-13b-q4_1.txt -time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_2.bin q4_2 2>&1 | tee ../qnt-13b-q4_2.txt time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-13b-q5_0.txt time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-13b-q5_1.txt time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-13b-q8_0.txt @@ -28,7 +26,6 @@ time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8 time ./bin/perplexity -m ../models/7B/ggml-model-f16.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-f16.txt time ./bin/perplexity -m ../models/7B/ggml-model-q4_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_0.txt time ./bin/perplexity -m ../models/7B/ggml-model-q4_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_1.txt -time ./bin/perplexity -m ../models/7B/ggml-model-q4_2.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_2.txt time ./bin/perplexity -m ../models/7B/ggml-model-q5_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q5_0.txt time ./bin/perplexity -m ../models/7B/ggml-model-q5_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q5_1.txt time ./bin/perplexity -m ../models/7B/ggml-model-q8_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q8_0.txt @@ -37,7 +34,6 @@ time ./bin/perplexity -m ../models/7B/ggml-model-q8_0.bin -f ./wiki.test.raw --n time ./bin/perplexity -m ../models/13B/ggml-model-f16.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-f16.txt time ./bin/perplexity -m ../models/13B/ggml-model-q4_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_0.txt time ./bin/perplexity -m ../models/13B/ggml-model-q4_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_1.txt -time ./bin/perplexity -m ../models/13B/ggml-model-q4_2.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_2.txt time ./bin/perplexity -m ../models/13B/ggml-model-q5_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q5_0.txt time ./bin/perplexity -m ../models/13B/ggml-model-q5_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q5_1.txt time ./bin/perplexity -m ../models/13B/ggml-model-q8_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q8_0.txt From 9e49d20150fe30d1a2be1f39233ddbe15448c6ff Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Mon, 8 May 2023 19:14:06 +0000 Subject: [PATCH 16/32] AVX implementations (#1370) --- SHA256SUMS | 16 +++-------- ggml.c | 82 +++++++++++++++++++----------------------------------- 2 files changed, 33 insertions(+), 65 deletions(-) diff --git a/SHA256SUMS b/SHA256SUMS index e487bdca6c9c2..9db08b597d0f9 100644 --- a/SHA256SUMS +++ b/SHA256SUMS @@ -1,24 +1,19 @@ 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth 666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin -99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6 models/7B/ggml-model-q4_0.bin -cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe models/7B/ggml-model-q4_1.bin 
-25b050337a87344da687a7f2adddc03bd99b7f6c140450e836649f3585fb6496 models/7B/ggml-model-q4_2.bin +ae89af479ab4d31c4e555ad8cc1dc9bf1f68d617186158cc381cd5a0fccd10bd models/7B/ggml-model-q4_0.bin +862072e2036a1bdb1a01ec2e159381f332a9e2357b886031c075fb7efa86db9b models/7B/ggml-model-q4_1.bin +0bef7cefa880a67a0b6d2a7e4559ded235823535ad616808dd8b5e47ff0a202f models/7B/ggml-model-q5_0.bin +97b9c38b2b8aed0c0aa90e0a975570ce3455c47d62128b382c55acbf6e2035f6 models/7B/ggml-model-q5_1.bin 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth 2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin -eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab models/13B/ggml-model-q4_0.bin -d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb models/13B/ggml-model-q4_1.bin -75a218a47df03f5f96354656329864613abcb67779412b9bc2282b28c1c3cbaa models/13B/ggml-model-q4_2.bin 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth 7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin -517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d models/30B/ggml-model-q4_0.bin -7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd models/30B/ggml-model-q4_1.bin -aadbc9cf806313a55be570f62884eed289d30c313fac3b7838717e01bd553204 models/30B/ggml-model-q4_2.bin 2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json 135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth 9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth @@ -29,8 +24,5 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/con 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth 60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin -01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2 models/65B/ggml-model-q4_0.bin -4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f models/65B/ggml-model-q4_1.bin -1b6f6588d0e2ecfe6c4d849088e48e5e3083466b962daa32e3261363e21fc5e9 models/65B/ggml-model-q4_2.bin 999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json 9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model diff --git a/ggml.c b/ggml.c index 8ef279fd74b6a..c0131805dd7cc 100644 --- a/ggml.c +++ b/ggml.c @@ -472,23 +472,16 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); // #if __AVX__ || __AVX2__ || __AVX512F__ -// Unpack 16 4-bit fields into 16 bytes -// The output vector contains 16 bytes, each one in [ 0 .. 
15 ] interval -static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi) -{ - // Load 8 bytes from memory - __m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi ); - - // Expand bytes into uint16_t values - __m128i bytes = _mm_cvtepu8_epi16( tmp ); - - // Unpack values into individual bytes - const __m128i lowMask = _mm_set1_epi8( 0xF ); - __m128i high = _mm_andnot_si128( lowMask, bytes ); - __m128i low = _mm_and_si128( lowMask, bytes ); - high = _mm_slli_epi16( high, 4 ); - bytes = _mm_or_si128( low, high ); - return bytes; +// multiply int8_t, add results pairwise twice +static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { + // Get absolute values of x vectors + const __m128i ax = _mm_sign_epi8(x, x); + // Sign the values of the y vectors + const __m128i sy = _mm_sign_epi8(y, x); + // Perform multiplication and create 16-bit values + const __m128i dot = _mm_maddubs_epi16(ax, sy); + const __m128i ones = _mm_set1_epi16(1); + return _mm_madd_epi16(ones, dot); } // horizontally add 8 floats @@ -535,19 +528,10 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) { // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { - // Load 16 bytes from memory - __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi ); - - // Expand bytes into uint16_t values - __m256i bytes = _mm256_cvtepu8_epi16( tmp ); - - // Unpack values into individual bytes + const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); + const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp); const __m256i lowMask = _mm256_set1_epi8( 0xF ); - __m256i high = _mm256_andnot_si256( lowMask, bytes ); - __m256i low = _mm256_and_si256( lowMask, bytes ); - high = _mm256_slli_epi16( high, 4 ); - bytes = _mm256_or_si256( low, high ); - return bytes; + return _mm256_and_si256(lowMask, bytes); } // add int16_t pairwise and return as float vector @@ -2146,31 +2130,23 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * // Compute combined scale for the block const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); - __m128i i32[2]; - for (int j = 0; j < 2; ++j) { - // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes - __m128i bx = bytes_from_nibbles_16(x[i].qs + 8*j); - __m128i by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16*j)); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
- const __m128i off = _mm_set1_epi8( 8 ); - bx = _mm_sub_epi8( bx, off ); + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); - // Get absolute values of x vectors - const __m128i ax = _mm_sign_epi8(bx, bx); + const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); - // Sign the values of the y vectors - const __m128i sy = _mm_sign_epi8(by, bx); + __m128i bx = _mm_and_si128(lowMask, tmp); + __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx, by); - // Perform multiplication and create 16-bit values - const __m128i dot = _mm_maddubs_epi16(ax, sy); - - const __m128i ones = _mm_set1_epi16(1); - i32[j] = _mm_madd_epi16(ones, dot); - } + bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); + by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx, by); // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] )); + __m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1)); // Apply the scale, and accumulate acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); } @@ -2484,8 +2460,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * int sumi = 0; for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; @@ -2673,8 +2649,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * int sumi = 0; for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; From 489bd13fadc34218166d71ee9b39712a292647de Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 8 May 2023 22:18:15 +0300 Subject: [PATCH 17/32] ggml : uniform 5th bit extraction --- ggml-cuda.cu | 8 ++++---- ggml.c | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 46f7b568c608b..f11d4dc23ddbc 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -123,8 +123,8 @@ static __global__ void dequantize_block_q5_0(const void * vx, float * y) { memcpy(&qh, x[i].qh, sizeof(qh)); for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; @@ -148,8 +148,8 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) { memcpy(&qh, x[i].qh, sizeof(qh)); for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int x0 = (x[i].qs[j] & 0xf) | xh_0; const int x1 = (x[i].qs[j] >> 4) 
| xh_1; diff --git a/ggml.c b/ggml.c index c0131805dd7cc..4335b10d0c387 100644 --- a/ggml.c +++ b/ggml.c @@ -1311,8 +1311,8 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict memcpy(&qh, x[i].qh, sizeof(qh)); for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; @@ -1338,8 +1338,8 @@ static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict memcpy(&qh, x[i].qh, sizeof(qh)); for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int x0 = (x[i].qs[j] & 0x0F) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; @@ -12090,8 +12090,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * memcpy(&qh, &y[i].qh, sizeof(qh)); for (int j = 0; j < QK5_0; j += 2) { - const uint8_t vh0 = ((qh & (1u << (j + 0))) >> (j + 0)) << 4; - const uint8_t vh1 = ((qh & (1u << (j + 1))) >> (j + 1)) << 4; + const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12)); // cast to 16 bins const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; @@ -12120,8 +12120,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * memcpy(&qh, &y[i].qh, sizeof(qh)); for (int j = 0; j < QK5_1; j += 2) { - const uint8_t vh0 = ((qh & (1u << (j + 0))) >> (j + 0)) << 4; - const uint8_t vh1 = ((qh & (1u << (j + 1))) >> (j + 1)) << 4; + const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12)); // cast to 16 bins const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; From d52172a509740daa9bffa81bb8c4de02dc1634e5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 9 May 2023 18:19:13 +0300 Subject: [PATCH 18/32] llama : produce error upon loading old model files --- llama.cpp | 15 +++++++++++++-- llama.h | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index 367522ac9051a..334d4e1bc260d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -402,6 +402,7 @@ enum llama_file_version { LLAMA_FILE_VERSION_GGML, LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab LLAMA_FILE_VERSION_GGJT_V1, // added padding + LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format }; struct llama_file_loader { @@ -432,6 +433,8 @@ struct llama_file_loader { file_version = LLAMA_FILE_VERSION_GGMF_V1; } else if (magic == 'ggjt' && version == 1) { file_version = LLAMA_FILE_VERSION_GGJT_V1; + } else if (magic == 'ggjt' && version == 2) { + file_version = LLAMA_FILE_VERSION_GGJT_V2; } else { throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", magic, version); @@ -837,8 +840,8 @@ static const char *llama_file_version_name(llama_file_version version) { switch (version) { case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)"; case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)"; - case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 
(latest)"; - default: LLAMA_ASSERT(false); + case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1305)"; + case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)"; } } @@ -915,6 +918,14 @@ static void llama_model_load_internal( fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); } + if (file_version != LLAMA_FILE_VERSION_GGJT_V2) { + if (hparams.ftype != LLAMA_FTYPE_ALL_F32 && + hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 && + hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) { + throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)"); + } + } + if (vocab_only) { return; } diff --git a/llama.h b/llama.h index fea5ffeffbee0..1a65cd5892389 100644 --- a/llama.h +++ b/llama.h @@ -19,7 +19,7 @@ # define LLAMA_API #endif -#define LLAMA_FILE_VERSION 1 +#define LLAMA_FILE_VERSION 2 #define LLAMA_FILE_MAGIC 'ggjt' #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml' #define LLAMA_SESSION_MAGIC 'ggsn' From 09032e0290b9393210e1280c05fb03b39e6d57ee Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 9 May 2023 18:25:28 +0300 Subject: [PATCH 19/32] llama : fix model magic/version write --- llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 334d4e1bc260d..1ba9a62b8cb50 100644 --- a/llama.cpp +++ b/llama.cpp @@ -529,8 +529,8 @@ struct llama_file_saver { write_vocab(); } void write_magic() { - file.write_u32('ggjt'); // magic - file.write_u32(1); // version + file.write_u32(LLAMA_FILE_MAGIC); // magic + file.write_u32(LLAMA_FILE_VERSION); // version } void write_hparams(enum llama_ftype new_ftype) { const llama_hparams & hparams = any_file_loader->hparams; From b7ad385d42f09d640e334283ebac46df3813a7b6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 10 May 2023 22:58:45 +0300 Subject: [PATCH 20/32] ggml : speed-up Q5_0 + Q5_1 at 4 threads --- ggml.c | 223 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 147 insertions(+), 76 deletions(-) diff --git a/ggml.c b/ggml.c index 4335b10d0c387..6efd51b864044 100644 --- a/ggml.c +++ b/ggml.c @@ -339,8 +339,9 @@ static float table_f32_f16[1 << 16]; #define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) #define B8(c,s ) B7(c,s, c), B7(c,s, s) -// precomputed tables for expanding 8bits to 8 bytes (shl 4) -static const uint64_t table_b2b_u[1 << 8] = { B8(00, 10) }; +// precomputed tables for expanding 8bits to 8 bytes: +static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 +static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, @@ -2307,68 +2308,102 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const block_q8_0 * restrict y = vy; #if defined(__ARM_NEON) - float32x4_t sumv = vdupq_n_f32(0.0f); + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); - uint64_t tmp[4]; + uint32_t qh0; + uint32_t qh1; - for (int i = 0; i < nb; ++i) { + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + for (int i = 0; i < nb; i += 2) { const block_q5_0 * restrict x0 = &x[i]; + const block_q5_0 * restrict x1 = &x[i + 1]; const block_q8_0 * restrict y0 = &y[i]; + const block_q8_0 * restrict y1 = &y[i + 1]; - const uint8x16_t m4b = vdupq_n_u8(0x0F); - const int8x16_t s16b = vdupq_n_s8(0x10); + const uint8x16_t m4b = vdupq_n_u8(0x0F); - // extract the 5th bit - uint32_t qh; - memcpy(&qh, x0->qh, sizeof(qh)); + // extract the 5th bit 
via lookup table ((!b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); - tmp[0] = table_b2b_u[(qh >> 0) & 0xFF]; - tmp[1] = table_b2b_u[(qh >> 8) & 0xFF]; - tmp[2] = table_b2b_u[(qh >> 16) & 0xFF]; - tmp[3] = table_b2b_u[(qh >> 24) ]; + tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_1[(qh0 >> 24) ]; - const int8x16_t qhl = vld1q_s8((const int8_t *)(tmp + 0)); - const int8x16_t qhh = vld1q_s8((const int8_t *)(tmp + 2)); + tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_1[(qh1 >> 24) ]; - const uint8x16_t v0 = vld1q_u8(x0->qs); + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); // 4-bit -> 8-bit - const int8x16_t v0l = vreinterpretq_s8_u8(vandq_u8 (v0, m4b)); - const int8x16_t v0h = vreinterpretq_s8_u8(vshrq_n_u8(v0, 4)); + int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - // add high bit and sub 16 - const int8x16_t v0lf = vsubq_s8(vorrq_s8(v0l, qhl), s16b); - const int8x16_t v0hf = vsubq_s8(vorrq_s8(v0h, qhh), s16b); + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1); // load y - const int8x16_t v1l = vld1q_s8(y0->qs); - const int8x16_t v1h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); const float x0d = GGML_FP16_TO_FP32(x0->d); + const float x1d = GGML_FP16_TO_FP32(x1->d); #if defined(__ARM_FEATURE_DOTPROD) - sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0lf, v1l), - vdotq_s32(vdupq_n_s32(0), v0hf, v1h))), x0d*y0->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d); #else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0lf), vget_low_s8 (v1l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0lf), vget_high_s8(v1l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0hf), vget_low_s8 (v1h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0hf), vget_high_s8(v1h)); + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), 
vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d); #endif } - *s = vaddvq_f32(sumv); + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); #elif defined(__wasm_simd128__) v128_t sumv = wasm_f32x4_splat(0.0f); + uint32_t qh; uint64_t tmp[4]; + // TODO: check if unrolling this is better for (int i = 0; i < nb; ++i) { const block_q5_0 * restrict x0 = &x[i]; const block_q8_0 * restrict y0 = &y[i]; @@ -2377,13 +2412,12 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const v128_t s16b = wasm_i8x16_splat(0x10); // extract the 5th bit - uint32_t qh; memcpy(&qh, x0->qh, sizeof(qh)); - tmp[0] = table_b2b_u[(qh >> 0) & 0xFF]; - tmp[1] = table_b2b_u[(qh >> 8) & 0xFF]; - tmp[2] = table_b2b_u[(qh >> 16) & 0xFF]; - tmp[3] = table_b2b_u[(qh >> 24) ]; + tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_1[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_1[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_1[(qh >> 24) ]; const v128_t qhl = wasm_v128_load(tmp + 0); const v128_t qhh = wasm_v128_load(tmp + 2); @@ -2395,8 +2429,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const v128_t v0h = wasm_u8x16_shr(v0, 4); // add high bit and sub 16 - const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0l, qhl), s16b); - const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0h, qhh), s16b); + const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); + const v128_t v0hf = wasm_i8x16_sub(v0h, qhh); // load y const v128_t v1l = wasm_v128_load(y0->qs); @@ -2488,69 +2522,107 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const block_q8_1 * restrict y = vy; #if defined(__ARM_NEON) - float32x4_t sumv = vdupq_n_f32(0.0f); + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); - float summs = 0.0f; + float summs0 = 0.0f; + float summs1 = 0.0f; - uint64_t tmp[4]; + uint32_t qh0; + uint32_t qh1; - for (int i = 0; i < nb; ++i) { + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + for (int i = 0; i < nb; i += 2) { const block_q5_1 * restrict x0 = &x[i]; + const block_q5_1 * restrict x1 = &x[i + 1]; const block_q8_1 * restrict y0 = &y[i]; + const block_q8_1 * restrict y1 = &y[i + 1]; - summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1); + const uint8x16_t m4b = vdupq_n_u8(0x0F); - // extract the 5th bit - uint32_t qh; - memcpy(&qh, x0->qh, sizeof(qh)); + summs0 += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1); + summs1 += GGML_FP16_TO_FP32(x1->m) * (y1->s0 + y1->s1); + + // extract the 5th bit via lookup table ((b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); - tmp[0] = table_b2b_u[(qh >> 0) & 0xFF]; - tmp[1] = table_b2b_u[(qh >> 8) & 0xFF]; - tmp[2] = table_b2b_u[(qh >> 16) & 0xFF]; - tmp[3] = table_b2b_u[(qh >> 24) ]; + tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; + tmp0[3] = 
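/* Each table_b2b_0 lookup expands one byte of qh into 8 bytes, one per
   quant, with the source bit moved to nibble position 4. A plausible way
   to compute such an entry (illustrative sketch, not the actual table
   initializer):

       uint64_t t = 0;
       for (int k = 0; k < 8; ++k)
           t |= (uint64_t)((b >> k) & 1) << (8*k + 4);   // byte k: 0x10 or 0x00

   table_b2b_1, used by the q5_0 path above, stores the inverted bit so
   that a single vsubq_s8 applies the high bit and the -16 offset at once. */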
table_b2b_0[(qh0 >> 24) ]; - const int8x16_t qhl = vld1q_s8((const int8_t *)(tmp + 0)); - const int8x16_t qhh = vld1q_s8((const int8_t *)(tmp + 2)); + tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_0[(qh1 >> 24) ]; - const uint8x16_t v0 = vld1q_u8(x0->qs); + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); // 4-bit -> 8-bit - const int8x16_t v0l = vreinterpretq_s8_u8(vandq_u8 (v0, vdupq_n_u8(0x0F))); - const int8x16_t v0h = vreinterpretq_s8_u8(vshrq_n_u8(v0, 4)); + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - // add - const int8x16_t v0lf = vorrq_s8(v0l, qhl); - const int8x16_t v0hf = vorrq_s8(v0h, qhh); + // add 5th bit + const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1); // load y - const int8x16_t v1l = vld1q_s8(y0->qs); - const int8x16_t v1h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); const float x0d = GGML_FP16_TO_FP32(x0->d); + const float x1d = GGML_FP16_TO_FP32(x1->d); #if defined(__ARM_FEATURE_DOTPROD) - sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0lf, v1l), - vdotq_s32(vdupq_n_s32(0), v0hf, v1h))), x0d*y0->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d); #else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0lf), vget_low_s8 (v1l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0lf), vget_high_s8(v1l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0hf), vget_low_s8 (v1h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0hf), vget_high_s8(v1h)); + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - sumv = 
vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d); #endif } - *s = vaddvq_f32(sumv) + summs; + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; #elif defined(__wasm_simd128__) v128_t sumv = wasm_f32x4_splat(0.0f); float summs = 0.0f; + uint32_t qh; uint64_t tmp[4]; for (int i = 0; i < nb; ++i) { @@ -2562,13 +2634,12 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const v128_t m4b = wasm_i8x16_splat(0x0F); // extract the 5th bit - uint32_t qh; memcpy(&qh, x0->qh, sizeof(qh)); - tmp[0] = table_b2b_u[(qh >> 0) & 0xFF]; - tmp[1] = table_b2b_u[(qh >> 8) & 0xFF]; - tmp[2] = table_b2b_u[(qh >> 16) & 0xFF]; - tmp[3] = table_b2b_u[(qh >> 24) ]; + tmp[0] = table_b2b_0[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_0[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_0[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_0[(qh >> 24) ]; const v128_t qhl = wasm_v128_load(tmp + 0); const v128_t qhh = wasm_v128_load(tmp + 2); From 695f3963b180c90a571c75e8e14d1cf1365632cc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 19:46:11 +0300 Subject: [PATCH 21/32] ggml : preserve old Q4 and Q5 formats --- ggml-cuda.cu | 27 ++++++++------- ggml-opencl.c | 1 + ggml.c | 91 ++++++++++++++++++++++++++------------------------- llama.cpp | 4 +-- 4 files changed, 62 insertions(+), 61 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index f11d4dc23ddbc..08d1566bdd880 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -86,8 +86,8 @@ static __global__ void dequantize_block_q4_0(const void * vx, float * y) { const int x0 = (x[i].qs[j] & 0xf) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; - y[i*qk + j + 0 ] = x0*d; - y[i*qk + j + qk/2] = x1*d; + y[i*qk + 2*j + 0] = x0*d; + y[i*qk + 2*j + 1] = x1*d; } } @@ -105,8 +105,8 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) { const int x0 = (x[i].qs[j] & 0xf); const int x1 = (x[i].qs[j] >> 4); - y[i*qk + j + 0 ] = x0*d + m; - y[i*qk + j + qk/2] = x1*d + m; + y[i*qk + 2*j + 0] = x0*d + m; + y[i*qk + 2*j + 1] = x1*d + m; } } @@ -129,8 +129,8 @@ static __global__ void dequantize_block_q5_0(const void * vx, float * y) { const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - y[i*qk + j + 0 ] = x0*d; - y[i*qk + j + qk/2] = x1*d; + y[i*qk + 2*j + 0] = x0*d; + y[i*qk + 2*j + 1] = x1*d; } } @@ -154,24 +154,23 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) { const int x0 = (x[i].qs[j] & 0xf) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; - y[i*qk + j + 0 ] = x0*d + m; - y[i*qk + j + qk/2] = x1*d + m; + y[i*qk + 2*j + 0] = x0*d + m; + y[i*qk + 2*j + 1] = x1*d + m; } } static __global__ void dequantize_block_q8_0(const void * vx, float * y) { + static const int qk = QK8_0; + const block_q8_0 * x = (const block_q8_0 *) vx; const int i = blockIdx.x; const float d = x[i].d; - const int8_t * pp = x[i].qs; - - for (int l = 0; l < QK8_0; l++) { - const int8_t vi = pp[l]; - - y[i*QK8_0 + l] = vi*d; + for (int j = 0; j < qk/2; ++j) { + y[i*qk + 2*j + 0] = x[i].qs[j + 0 ]*d; + y[i*qk + 2*j + 1] = x[i].qs[j + qk/2]*d; } } diff --git a/ggml-opencl.c b/ggml-opencl.c index 0e6e6770f6307..230c84f2fb411 100644 --- a/ggml-opencl.c +++ b/ggml-opencl.c @@ -114,6 +114,7 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f const uint i = get_global_id(0) / 32; const uint 
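/* Why the TODO below flags this kernel (illustrative): after the layout
   change the block stores the 16 even elements first and the 16 odd
   elements second, so the correct mapping would be

       result[i*32 + 2*(l%16) + l/16] = blocks[i].qs[l] * blocks[i].d;

   rather than the straight result[i*32 + l] indexing. */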
l = get_local_id(0); + // TODO: this is broken result[i*32 + l] = blocks[i].qs[l] * blocks[i].d; } diff --git a/ggml.c b/ggml.c index 6efd51b864044..21c297e5a0cc6 100644 --- a/ggml.c +++ b/ggml.c @@ -751,8 +751,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r y[i].d = d; for (int j = 0; j < qk/2; ++j) { - const float x0 = x[i*qk + 0 + j]*id; - const float x1 = x[i*qk + qk/2 + j]*id; + const float x0 = x[i*qk + 2*j + 0]*id; + const float x1 = x[i*qk + 2*j + 1]*id; const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); @@ -792,8 +792,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r y[i].m = min; for (int j = 0; j < qk/2; ++j) { - const float x0 = (x[i*qk + 0 + j] - min)*id; - const float x1 = (x[i*qk + qk/2 + j] - min)*id; + const float x0 = (x[i*qk + 2*j + 0] - min)*id; + const float x1 = (x[i*qk + 2*j + 1] - min)*id; const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); @@ -835,8 +835,8 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r uint32_t qh = 0; for (int j = 0; j < qk/2; ++j) { - const float x0 = x[i*qk + 0 + j]*id; - const float x1 = x[i*qk + qk/2 + j]*id; + const float x0 = x[i*qk + 2*j + 0]*id; + const float x1 = x[i*qk + 2*j + 1]*id; const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); @@ -883,8 +883,8 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r uint32_t qh = 0; for (int j = 0; j < qk/2; ++j) { - const float x0 = (x[i*qk + 0 + j] - min)*id; - const float x1 = (x[i*qk + qk/2 + j] - min)*id; + const float x0 = (x[i*qk + 2*j + 0] - min)*id; + const float x1 = (x[i*qk + 2*j + 1] - min)*id; const uint8_t xi0 = (uint8_t)(x0 + 0.5f); const uint8_t xi1 = (uint8_t)(x1 + 0.5f); @@ -922,10 +922,12 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r y[i].d = d; - for (int j = 0; j < QK8_0; ++j) { - const float v0 = x[i*QK8_0 + j]*id; + for (int j = 0; j < QK8_0/2; ++j) { + const float v0 = x[i*QK8_0 + 2*j + 0]*id; + const float v1 = x[i*QK8_0 + 2*j + 1]*id; - y[i].qs[j] = roundf(v0); + y[i].qs[ j] = v0 + 0.5f; + y[i].qs[QK8_0/2 + j] = v1 + 0.5f; } } } @@ -943,12 +945,12 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int float32x4_t asrcv[8]; float32x4_t amaxv[8]; - for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l); - for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]); + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); - for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]); - for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]); - for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]); + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); const float amax = vmaxvq_f32(amaxv[0]); @@ -957,14 +959,14 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int y[i].d = d; - for (int l = 0; l < 8; l++) { - const float32x4_t v = vmulq_n_f32(srcv[l], id); + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); const int32x4_t vi = vcvtnq_s32_f32(v); - 
y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3); + y[i].qs[ 2*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1); + y[i].qs[ 2*j + 1] = vgetq_lane_s32(vi, 2); + y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3); } } #elif defined(__AVX2__) || defined(__AVX__) @@ -1080,11 +1082,11 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r int sum1 = 0; for (int j = 0; j < QK8_1/2; ++j) { - const float v0 = x[i*QK8_1 + j]*id; - const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id; + const float v0 = x[i*QK8_1 + 2*j + 0]*id; + const float v1 = x[i*QK8_1 + 2*j + 1]*id; - y[i].qs[ j] = roundf(v0); - y[i].qs[QK8_1/2 + j] = roundf(v1); + y[i].qs[ j] = v0 + 0.5f; + y[i].qs[QK8_1/2 + j] = v1 + 0.5f; sum0 += y[i].qs[ j]; sum1 += y[i].qs[QK8_1/2 + j]; @@ -1129,10 +1131,10 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int const float32x4_t v = vmulq_n_f32(srcv[j], id); const int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + y[i].qs[ 2*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1); + y[i].qs[ 2*j + 1] = vgetq_lane_s32(vi, 2); + y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3); accv0 = vaddq_s32(accv0, vi); } @@ -1142,10 +1144,10 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int const float32x4_t v = vmulq_n_f32(srcv[j], id); const int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + y[i].qs[ 2*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1); + y[i].qs[ 2*j + 1] = vgetq_lane_s32(vi, 2); + y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3); accv1 = vaddq_s32(accv1, vi); } @@ -1271,8 +1273,8 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict const int x0 = (x[i].qs[j] & 0x0F) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; - y[i*qk + j + 0 ] = x0*d; - y[i*qk + j + qk/2] = x1*d; + y[i*qk + 2*j + 0] = x0*d; + y[i*qk + 2*j + 1] = x1*d; } } } @@ -1292,8 +1294,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict const int x0 = (x[i].qs[j] & 0x0F); const int x1 = (x[i].qs[j] >> 4); - y[i*qk + j + 0 ] = x0*d + m; - y[i*qk + j + qk/2] = x1*d + m; + y[i*qk + 2*j + 0] = x0*d + m; + y[i*qk + 2*j + 1] = x1*d + m; } } } @@ -1318,8 +1320,8 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - y[i*qk + j + 0 ] = x0*d; - y[i*qk + j + qk/2] = x1*d; + y[i*qk + 2*j + 0] = x0*d; + y[i*qk + 2*j + 1] = x1*d; } } } @@ -1345,8 +1347,8 @@ static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict const int x0 = (x[i].qs[j] & 0x0F) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; - y[i*qk + j + 0 ] = x0*d + m; - y[i*qk + j + qk/2] = x1*d + m; + y[i*qk + 2*j + 0] = x0*d + m; + y[i*qk + 2*j + 1] = x1*d + m; } } } @@ -1363,8 +1365,9 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in for (int i = 0; i < nb; i++) { const float d = x[i].d; - for (int j = 0; j < qk; ++j) { - y[i*qk + j] = 
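/* The replacement loop below reflects the same layout (illustrative
   summary): the quants are stored de-interleaved, evens first, odds
   second,

       qs[j]        holds element 2*j + 0   (j = 0 .. qk/2 - 1)
       qs[qk/2 + j] holds element 2*j + 1

   matching the order in which the quantizers above write their output. */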
x[i].qs[j]*d; + for (int j = 0; j < qk/2; ++j) { + y[i*qk + 2*j + 0] = x[i].qs[j + 0 ]*d; + y[i*qk + 2*j + 1] = x[i].qs[j + qk/2]*d; } } } diff --git a/llama.cpp b/llama.cpp index 1ba9a62b8cb50..be9f8fffb8084 100644 --- a/llama.cpp +++ b/llama.cpp @@ -919,9 +919,7 @@ static void llama_model_load_internal( } if (file_version != LLAMA_FILE_VERSION_GGJT_V2) { - if (hparams.ftype != LLAMA_FTYPE_ALL_F32 && - hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 && - hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) { + if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) { throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)"); } } From 582a39fff532d2d57d5a41106a7f3a3309e49765 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 20:11:37 +0300 Subject: [PATCH 22/32] ggml : simplify Q8_1 - no need for low / high sums anymore --- ggml.c | 69 +++++++++++++++++++--------------------------------------- 1 file changed, 22 insertions(+), 47 deletions(-) diff --git a/ggml.c b/ggml.c index 21c297e5a0cc6..619ce33d843e3 100644 --- a/ggml.c +++ b/ggml.c @@ -718,12 +718,11 @@ static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block siz #define QK8_1 32 typedef struct { - float d; // delta - float s0; // d * sum(qs[i]) low - float s1; // d * sum(qs[i]) high - int8_t qs[QK8_1]; // quants + float d; // delta + float s; // d * sum(qs[i]) + int8_t qs[QK8_1]; // quants } block_q8_1; -static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); +static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); // reference implementation for deterministic creation of model files static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { @@ -1078,8 +1077,7 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r y[i].d = d; - int sum0 = 0; - int sum1 = 0; + int sum = 0; for (int j = 0; j < QK8_1/2; ++j) { const float v0 = x[i*QK8_1 + 2*j + 0]*id; @@ -1088,12 +1086,11 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r y[i].qs[ j] = v0 + 0.5f; y[i].qs[QK8_1/2 + j] = v1 + 0.5f; - sum0 += y[i].qs[ j]; - sum1 += y[i].qs[QK8_1/2 + j]; + sum += y[i].qs[ j]; + sum += y[i].qs[QK8_1/2 + j]; } - y[i].s0 = d * sum0; - y[i].s1 = d * sum1; + y[i].s = d * sum; } } @@ -1123,24 +1120,9 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int y[i].d = d; - int32x4_t accv0 = vdupq_n_s32(0); - int32x4_t accv1 = vdupq_n_s32(0); - - // low half - for (int j = 0; j < 4; j++) { - const float32x4_t v = vmulq_n_f32(srcv[j], id); - const int32x4_t vi = vcvtnq_s32_f32(v); + int32x4_t accv = vdupq_n_s32(0); - y[i].qs[ 2*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1); - y[i].qs[ 2*j + 1] = vgetq_lane_s32(vi, 2); - y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3); - - accv0 = vaddq_s32(accv0, vi); - } - - // high half - for (int j = 4; j < 8; j++) { + for (int j = 0; j < 8; j++) { const float32x4_t v = vmulq_n_f32(srcv[j], id); const int32x4_t vi = vcvtnq_s32_f32(v); @@ -1149,14 +1131,10 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int y[i].qs[ 2*j + 1] = vgetq_lane_s32(vi, 2); y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3); - accv1 = vaddq_s32(accv1, vi); + accv = vaddq_s32(accv, vi); } - const int32_t sum0 = vaddvq_s32(accv0); - const int32_t sum1 = vaddvq_s32(accv1); - - y[i].s0 = d * sum0; - y[i].s1 = d * sum1; + y[i].s = d * 
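/* Why a single cached sum suffices (illustrative): for a q4_1/q5_1 block
   with x_j = d4*q_j + m dotted against y_j = d8*p_j,

       sum_j x_j*y_j = d4*d8 * sum_j q_j*p_j + m * (d8 * sum_j p_j)
                     = d4*d8 * sumi          + m * y->s

   so the old per-half sums s0 and s1 were never needed individually. */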
vaddvq_s32(accv); } #elif defined(__AVX2__) || defined(__AVX__) for (int i = 0; i < nb; i++) { @@ -1205,9 +1183,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int #if defined(__AVX2__) // Compute the sum of the quants and set y[i].s - //y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); - y[i].s0 = d * hsum_i32_8(_mm256_add_epi32(i0, i1)); - y[i].s1 = d * hsum_i32_8(_mm256_add_epi32(i2, i3)); + y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); // Convert int32 to int16 i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 @@ -1237,8 +1213,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int // Compute the sum of the quants and set y[i].s const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); - y[i].s0 = d * hsum_i32_4(s0); - y[i].s1 = d * hsum_i32_4(s1); + y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1)); // Convert int32 to int16 ni0 = _mm_packs_epi32( ni0, ni1 ); @@ -2200,7 +2175,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * const block_q8_1 * restrict y0 = &y[i + 0]; const block_q8_1 * restrict y1 = &y[i + 1]; - summs += x0->m * (y0->s0 + y0->s1) + x1->m * (y1->s0 + y1->s1); + summs += x0->m * y0->s + x1->m * y1->s; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -2259,7 +2234,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * const float * d0 = &x[i].d; const float * d1 = &y[i].d; - summs += x[i].m * (y[i].s0 + y[i].s1); + summs += x[i].m * y[i].s; const __m256 d0v = _mm256_broadcast_ss( d0 ); const __m256 d1v = _mm256_broadcast_ss( d1 ); @@ -2292,7 +2267,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } - sumf += (x[i].d*y[i].d)*sumi + x[i].m*(y[i].s0 + y[i].s1); + sumf += (x[i].d*y[i].d)*sumi + x[i].m*y[i].s; } *s = sumf; @@ -2545,8 +2520,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const uint8x16_t m4b = vdupq_n_u8(0x0F); - summs0 += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1); - summs1 += GGML_FP16_TO_FP32(x1->m) * (y1->s0 + y1->s1); + summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s; + summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s; // extract the 5th bit via lookup table ((b) << 4) memcpy(&qh0, x0->qh, sizeof(qh0)); @@ -2632,7 +2607,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const block_q5_1 * restrict x0 = &x[i]; const block_q8_1 * restrict y0 = &y[i]; - summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1); + summs += GGML_FP16_TO_FP32(x0->m) * y0->s; const v128_t m4b = wasm_i8x16_splat(0x0F); @@ -2696,7 +2671,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * for (int i = 0; i < nb; i++) { const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); - summs += GGML_FP16_TO_FP32(x[i].m) * (y[i].s0 + y[i].s1); + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; __m256i bx = bytes_from_nibbles_32(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -2732,7 +2707,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); } - sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*(y[i].s0 + y[i].s1); + sumf += 
(GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; } *s = sumf; From 6680244838dbddcd20a08eed39398520573c94fd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 20:47:41 +0300 Subject: [PATCH 23/32] ggml : fix Q8_0 and Q8_1 rounding --- ggml.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml.c b/ggml.c index 619ce33d843e3..a39ae74b13291 100644 --- a/ggml.c +++ b/ggml.c @@ -925,8 +925,8 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r const float v0 = x[i*QK8_0 + 2*j + 0]*id; const float v1 = x[i*QK8_0 + 2*j + 1]*id; - y[i].qs[ j] = v0 + 0.5f; - y[i].qs[QK8_0/2 + j] = v1 + 0.5f; + y[i].qs[ j] = roundf(v0); + y[i].qs[QK8_0/2 + j] = roundf(v1); } } } @@ -1083,8 +1083,8 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r const float v0 = x[i*QK8_1 + 2*j + 0]*id; const float v1 = x[i*QK8_1 + 2*j + 1]*id; - y[i].qs[ j] = v0 + 0.5f; - y[i].qs[QK8_1/2 + j] = v1 + 0.5f; + y[i].qs[ j] = roundf(v0); + y[i].qs[QK8_1/2 + j] = roundf(v1); sum += y[i].qs[ j]; sum += y[i].qs[QK8_1/2 + j]; From bd5e373058de91d6f1dd8e58c108dea96fd86ec4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 20:57:28 +0300 Subject: [PATCH 24/32] Revert "AVX implementations (#1370)" This reverts commit 948d124837f9d287d8490f41338e0e4cceb0814f. --- SHA256SUMS | 16 ++++++++--- ggml.c | 82 +++++++++++++++++++++++++++++++++++------------------- 2 files changed, 65 insertions(+), 33 deletions(-) diff --git a/SHA256SUMS b/SHA256SUMS index 9db08b597d0f9..e487bdca6c9c2 100644 --- a/SHA256SUMS +++ b/SHA256SUMS @@ -1,19 +1,24 @@ 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth 666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin -ae89af479ab4d31c4e555ad8cc1dc9bf1f68d617186158cc381cd5a0fccd10bd models/7B/ggml-model-q4_0.bin -862072e2036a1bdb1a01ec2e159381f332a9e2357b886031c075fb7efa86db9b models/7B/ggml-model-q4_1.bin -0bef7cefa880a67a0b6d2a7e4559ded235823535ad616808dd8b5e47ff0a202f models/7B/ggml-model-q5_0.bin -97b9c38b2b8aed0c0aa90e0a975570ce3455c47d62128b382c55acbf6e2035f6 models/7B/ggml-model-q5_1.bin +99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6 models/7B/ggml-model-q4_0.bin +cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe models/7B/ggml-model-q4_1.bin +25b050337a87344da687a7f2adddc03bd99b7f6c140450e836649f3585fb6496 models/7B/ggml-model-q4_2.bin 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth 2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin +eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab models/13B/ggml-model-q4_0.bin +d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb models/13B/ggml-model-q4_1.bin +75a218a47df03f5f96354656329864613abcb67779412b9bc2282b28c1c3cbaa models/13B/ggml-model-q4_2.bin 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 
models/30B/consolidated.02.pth 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth 7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin +517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d models/30B/ggml-model-q4_0.bin +7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd models/30B/ggml-model-q4_1.bin +aadbc9cf806313a55be570f62884eed289d30c313fac3b7838717e01bd553204 models/30B/ggml-model-q4_2.bin 2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json 135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth 9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth @@ -24,5 +29,8 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/con 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth 60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin +01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2 models/65B/ggml-model-q4_0.bin +4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f models/65B/ggml-model-q4_1.bin +1b6f6588d0e2ecfe6c4d849088e48e5e3083466b962daa32e3261363e21fc5e9 models/65B/ggml-model-q4_2.bin 999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json 9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model diff --git a/ggml.c b/ggml.c index a39ae74b13291..30485d113b1ba 100644 --- a/ggml.c +++ b/ggml.c @@ -473,16 +473,23 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); // #if __AVX__ || __AVX2__ || __AVX512F__ -// multiply int8_t, add results pairwise twice -static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { - // Get absolute values of x vectors - const __m128i ax = _mm_sign_epi8(x, x); - // Sign the values of the y vectors - const __m128i sy = _mm_sign_epi8(y, x); - // Perform multiplication and create 16-bit values - const __m128i dot = _mm_maddubs_epi16(ax, sy); - const __m128i ones = _mm_set1_epi16(1); - return _mm_madd_epi16(ones, dot); +// Unpack 16 4-bit fields into 16 bytes +// The output vector contains 16 bytes, each one in [ 0 .. 15 ] interval +static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi) +{ + // Load 8 bytes from memory + __m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi ); + + // Expand bytes into uint16_t values + __m128i bytes = _mm_cvtepu8_epi16( tmp ); + + // Unpack values into individual bytes + const __m128i lowMask = _mm_set1_epi8( 0xF ); + __m128i high = _mm_andnot_si128( lowMask, bytes ); + __m128i low = _mm_and_si128( lowMask, bytes ); + high = _mm_slli_epi16( high, 4 ); + bytes = _mm_or_si128( low, high ); + return bytes; } // horizontally add 8 floats @@ -529,10 +536,19 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) { // The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { - const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); - const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp); + // Load 16 bytes from memory + __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi ); + + // Expand bytes into uint16_t values + __m256i bytes = _mm256_cvtepu8_epi16( tmp ); + + // Unpack values into individual bytes const __m256i lowMask = _mm256_set1_epi8( 0xF ); - return _mm256_and_si256(lowMask, bytes); + __m256i high = _mm256_andnot_si256( lowMask, bytes ); + __m256i low = _mm256_and_si256( lowMask, bytes ); + high = _mm256_slli_epi16( high, 4 ); + bytes = _mm256_or_si256( low, high ); + return bytes; } // add int16_t pairwise and return as float vector @@ -2109,23 +2125,31 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * // Compute combined scale for the block const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); - const __m128i lowMask = _mm_set1_epi8(0xF); - const __m128i off = _mm_set1_epi8(8); + __m128i i32[2]; + for (int j = 0; j < 2; ++j) { + // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes + __m128i bx = bytes_from_nibbles_16(x[i].qs + 8*j); + __m128i by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16*j)); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. + const __m128i off = _mm_set1_epi8( 8 ); + bx = _mm_sub_epi8( bx, off ); - const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); + // Get absolute values of x vectors + const __m128i ax = _mm_sign_epi8(bx, bx); - __m128i bx = _mm_and_si128(lowMask, tmp); - __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs); - bx = _mm_sub_epi8(bx, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx, by); + // Sign the values of the y vectors + const __m128i sy = _mm_sign_epi8(by, bx); - bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); - by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); - bx = _mm_sub_epi8(bx, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx, by); + // Perform multiplication and create 16-bit values + const __m128i dot = _mm_maddubs_epi16(ax, sy); + + const __m128i ones = _mm_set1_epi16(1); + i32[j] = _mm_madd_epi16(ones, dot); + } // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1)); + __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] )); // Apply the scale, and accumulate acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); } @@ -2472,8 +2496,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * int sumi = 0; for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; @@ -2698,8 +2722,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * int sumi = 0; for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; From 
5bc286ab18e20db719cc0959bab3803f119d404c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 21:22:27 +0300 Subject: [PATCH 25/32] ggml : fix AVX2 implementation --- ggml.c | 106 +++++++++++++++++++++++++++------------------------------ 1 file changed, 51 insertions(+), 55 deletions(-) diff --git a/ggml.c b/ggml.c index 30485d113b1ba..f4e626433ee1f 100644 --- a/ggml.c +++ b/ggml.c @@ -473,23 +473,16 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); // #if __AVX__ || __AVX2__ || __AVX512F__ -// Unpack 16 4-bit fields into 16 bytes -// The output vector contains 16 bytes, each one in [ 0 .. 15 ] interval -static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi) -{ - // Load 8 bytes from memory - __m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi ); - - // Expand bytes into uint16_t values - __m128i bytes = _mm_cvtepu8_epi16( tmp ); - - // Unpack values into individual bytes - const __m128i lowMask = _mm_set1_epi8( 0xF ); - __m128i high = _mm_andnot_si128( lowMask, bytes ); - __m128i low = _mm_and_si128( lowMask, bytes ); - high = _mm_slli_epi16( high, 4 ); - bytes = _mm_or_si128( low, high ); - return bytes; +// multiply int8_t, add results pairwise twice +static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { + // Get absolute values of x vectors + const __m128i ax = _mm_sign_epi8(x, x); + // Sign the values of the y vectors + const __m128i sy = _mm_sign_epi8(y, x); + // Perform multiplication and create 16-bit values + const __m128i dot = _mm_maddubs_epi16(ax, sy); + const __m128i ones = _mm_set1_epi16(1); + return _mm_madd_epi16(ones, dot); } // horizontally add 8 floats @@ -524,14 +517,21 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) { uint32_t x32; memcpy(&x32, x, sizeof(uint32_t)); const __m256i shuf_mask = _mm256_set_epi64x( - 0x0303030303030303, 0x0202020202020202, - 0x0101010101010101, 0x0000000000000000); + 0x0303030303030303, 0x0202020202020202, + 0x0101010101010101, 0x0000000000000000); __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); bytes = _mm256_or_si256(bytes, bit_mask); return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); } +static inline __m256i bytes_from_nibbles_32_deinterleave(const uint8_t * rsi) { + const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); + const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp); + const __m256i lowMask = _mm256_set1_epi8( 0xF ); + return _mm256_and_si256(lowMask, bytes); +} + // Unpack 32 4-bit fields into 32 bytes // The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) @@ -984,7 +984,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3); } } -#elif defined(__AVX2__) || defined(__AVX__) +#elif defined(__AVX2__) for (int i = 0; i < nb; i++) { // Load elements into 4 AVX vectors __m256 v0 = _mm256_loadu_ps( x ); @@ -1029,7 +1029,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int __m256i i2 = _mm256_cvtps_epi32( v2 ); __m256i i3 = _mm256_cvtps_epi32( v3 ); -#if defined(__AVX2__) +#if defined(__AVX2__) // || defined(__AVX__) TODO // Convert int32 to int16 i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 @@ -1037,10 +1037,11 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); + // TODO: find a smarter way to do this + i2 = _mm256_permute2f128_si256(i0, i0, 0x01); + i1 = _mm256_shuffle_epi8(i0, _mm256_setr_epi8( 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14,-1,-1,-1,-1, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15)); + i2 = _mm256_shuffle_epi8(i2, _mm256_setr_epi8(-1,-1, 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15,-1,-1)); + i0 = _mm256_or_si256(i1, i2); _mm256_storeu_si256((__m256i *)y[i].qs, i0); #else @@ -1152,7 +1153,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int y[i].s = d * vaddvq_s32(accv); } -#elif defined(__AVX2__) || defined(__AVX__) +#elif defined(__AVX2__) for (int i = 0; i < nb; i++) { // Load elements into 4 AVX vectors __m256 v0 = _mm256_loadu_ps( x ); @@ -1197,7 +1198,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int __m256i i2 = _mm256_cvtps_epi32( v2 ); __m256i i3 = _mm256_cvtps_epi32( v3 ); -#if defined(__AVX2__) +#if defined(__AVX2__) // || defined(__AVX__) TODO // Compute the sum of the quants and set y[i].s y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); @@ -1208,10 +1209,11 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); + // TODO: find a smarter way to do this + i2 = _mm256_permute2f128_si256(i0, i0, 0x01); + i1 = _mm256_shuffle_epi8(i0, _mm256_setr_epi8( 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14,-1,-1,-1,-1, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15)); + i2 = _mm256_shuffle_epi8(i2, _mm256_setr_epi8(-1,-1, 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15,-1,-1)); + i0 = 
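/* What the shuffle pair above does (illustrative): _mm256_packs_epi32 and
   _mm256_packs_epi16 narrow each 128-bit lane independently, leaving the
   32 bytes lane-scrambled; the permute2f128 copy plus the two byte
   shuffles undo that scramble and emit the bytes in the evens-then-odds
   block order in one pass, replacing the _mm256_permutevar8x32_epi32
   fix-up that only restored sequential order. */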
_mm256_or_si256(i1, i2); _mm256_storeu_si256((__m256i *)y[i].qs, i0); #else @@ -2101,7 +2103,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * /* Compute combined scale for the block */ const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); - __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs); // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. const __m256i off = _mm256_set1_epi8( 8 ); @@ -2125,31 +2127,24 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * // Compute combined scale for the block const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); - __m128i i32[2]; - for (int j = 0; j < 2; ++j) { - // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes - __m128i bx = bytes_from_nibbles_16(x[i].qs + 8*j); - __m128i by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16*j)); + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. - const __m128i off = _mm_set1_epi8( 8 ); - bx = _mm_sub_epi8( bx, off ); + const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); - // Get absolute values of x vectors - const __m128i ax = _mm_sign_epi8(bx, bx); + __m128i bx = _mm_and_si128(lowMask, tmp); + __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx, by); - // Sign the values of the y vectors - const __m128i sy = _mm_sign_epi8(by, bx); - - // Perform multiplication and create 16-bit values - const __m128i dot = _mm_maddubs_epi16(ax, sy); - - const __m128i ones = _mm_set1_epi16(1); - i32[j] = _mm_madd_epi16(ones, dot); - } + bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); + by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx, by); // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] )); + __m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1)); + // Apply the scale, and accumulate acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); } @@ -2267,7 +2262,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes - const __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs); const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); const __m256 xy = mul_sum_i8_pairs_float(bx, by); @@ -2471,7 +2466,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * /* Compute combined scale for the block */ const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d)); - __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); bx = _mm256_or_si256(bx, bxhi); @@ -2689,6 +2684,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * #elif defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); + float summs = 0.0f; // Main loop @@ 
-2697,7 +2693,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; - __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); bx = _mm256_or_si256(bx, bxhi); From e038e01e28fda0dd35f9feb230705072447d6cc6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 21:33:29 +0300 Subject: [PATCH 26/32] sha : update hashes for 7B and 13B --- SHA256SUMS | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/SHA256SUMS b/SHA256SUMS index e487bdca6c9c2..c3f935a85b135 100644 --- a/SHA256SUMS +++ b/SHA256SUMS @@ -1,24 +1,27 @@ 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth 666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin -99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6 models/7B/ggml-model-q4_0.bin -cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe models/7B/ggml-model-q4_1.bin -25b050337a87344da687a7f2adddc03bd99b7f6c140450e836649f3585fb6496 models/7B/ggml-model-q4_2.bin +b734d7201dc7869855fe2861247178719607d96372f0fb1bf6a1c5810898a48f models/7B/ggml-model-q4_0.bin +1ea1d3e94d0012ee5c23ee5ee2c8909eb124a1e8e43c11108feb17879d8b9379 models/7B/ggml-model-q4_1.bin +3232f282b40e3330093acb96e7d4983ce15b80a7e38b49d035e83b9aab753671 models/7B/ggml-model-q5_0.bin +75b1e0ef9a7ba27d760e4239422e29a6ced0ff9c4f2537f1cc4754821bdb8d3e models/7B/ggml-model-q5_1.bin 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth 2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin -eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab models/13B/ggml-model-q4_0.bin -d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb models/13B/ggml-model-q4_1.bin -75a218a47df03f5f96354656329864613abcb67779412b9bc2282b28c1c3cbaa models/13B/ggml-model-q4_2.bin +a8dd1a853a3227abda5b2046dcc23b1f06ee8b837bc97b34f6b182229eca21ff models/13B/ggml-model-q4_0.bin +3a58a576f0e188ad77bc5104407f1c7cf129928d1af2f920099fa206ca6af34a models/13B/ggml-model-q4_1.bin +814f9e369ca0daf4517b6a66bdf8d616c5d4ae8b4353fe091d15080e66965c34 models/13B/ggml-model-q5_0.bin +74ab4eacb6ef14e08c7f06a2dd0b2630c3f920149324acf6651222ed397c430f models/13B/ggml-model-q5_1.bin 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth 7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin -517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d models/30B/ggml-model-q4_0.bin -7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd models/30B/ggml-model-q4_1.bin -aadbc9cf806313a55be570f62884eed289d30c313fac3b7838717e01bd553204 
models/30B/ggml-model-q4_2.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_0.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_1.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_0.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_1.bin 2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json 135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth 9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth @@ -29,8 +32,9 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/con 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth 60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin -01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2 models/65B/ggml-model-q4_0.bin -4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f models/65B/ggml-model-q4_1.bin -1b6f6588d0e2ecfe6c4d849088e48e5e3083466b962daa32e3261363e21fc5e9 models/65B/ggml-model-q4_2.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_0.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_1.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_0.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_1.bin 999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json 9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model From 51c25fd99570db81d0a4c03041f89cf88630918d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 21:38:47 +0300 Subject: [PATCH 27/32] readme : update timings + remove warning banner --- README.md | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 75bd9faf57e5d..396e3be657b54 100644 --- a/README.md +++ b/README.md @@ -7,14 +7,6 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ -## ⚠️ TEMPORARY NOTICE ABOUT UPCOMING BREAKING CHANGE ⚠️ - -**The quantization formats will soon be updated: https://github.com/ggerganov/llama.cpp/pull/1305** - -**All `ggml` model files using the old format will not work with the latest `llama.cpp` code after that change is merged** - ---- - **Hot topics:** - [Roadmap May 2023](https://github.com/ggerganov/llama.cpp/discussions/1220) @@ -342,13 +334,13 @@ Several quantization methods are supported. 
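The bits/weight rows can be sanity-checked from the block layouts: a Q4_0 block packs 32 weights into 4 + 16 = 20 bytes (one `float` scale plus 32 nibbles), i.e. 20*8/32 = 5.0 bits/weight; Q4_1 and Q5_1 both total 24 bytes per block = 6.0; Q5_0 is 2 + 4 + 16 = 22 bytes = 5.5; Q8_0 is 4 + 32 = 36 bytes = 9.0.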
They differ in the resulting model d |------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:| | 7B | perplexity | 5.9066 | 6.1620 | 6.0910 | 5.9862 | 5.9481 | 5.9069 | | 7B | file size | 13.0G | 4.0G | 4.8G | 4.4G | 4.8G | 7.1G | -| 7B | ms/tok @ 4th | 128 | 56 | 61 | 91 | 95 | 75 | -| 7B | ms/tok @ 8th | 128 | 47 | 55 | 53 | 59 | 75 | +| 7B | ms/tok @ 4th | 128 | 50 | 54 | 75 | 83 | 75 | +| 7B | ms/tok @ 8th | 123 | 44 | 52 | 53 | 58 | 72 | | 7B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 | | 13B | perplexity | 5.2543 | 5.3863 | 5.3607 | 5.2856 | 5.2706 | 5.2548 | | 13B | file size | 25.0G | 7.6G | 9.1G | 8.4G | 9.1G | 14G | -| 13B | ms/tok @ 4th | 239 | 104 | 113 | 176 | 185 | 141 | -| 13B | ms/tok @ 8th | 240 | 85 | 99 | 108 | 117 | 147 | +| 13B | ms/tok @ 4th | 239 | 93 | 101 | 150 | 164 | 141 | +| 13B | ms/tok @ 8th | 240 | 81 | 96 | 96 | 104 | 136 | | 13B | bits/weight | 16.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0 | ### Perplexity (measuring model quality) From 1c87847b6bf10cf4ecc1d6b4b96f9d8b9449820c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 21:48:56 +0300 Subject: [PATCH 28/32] llama : update v2 PR number to 1405 --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index be9f8fffb8084..b2dbc6c3b0862 100644 --- a/llama.cpp +++ b/llama.cpp @@ -840,7 +840,7 @@ static const char *llama_file_version_name(llama_file_version version) { switch (version) { case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)"; case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)"; - case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1305)"; + case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)"; case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)"; } } From 832c53f4274353ec6f16a88d1c0e830526a229fc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 21:59:25 +0300 Subject: [PATCH 29/32] ggml : fix WASM comments --- ggml.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index f4e626433ee1f..a9c10a295cdea 100644 --- a/ggml.c +++ b/ggml.c @@ -2425,7 +2425,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const v128_t v0l = wasm_v128_and (v0, m4b); const v128_t v0h = wasm_u8x16_shr(v0, 4); - // add high bit and sub 16 + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); const v128_t v0hf = wasm_i8x16_sub(v0h, qhh); @@ -2570,7 +2570,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - // add 5th bit + // add high bit const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0); const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0); const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1); @@ -2622,6 +2622,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * uint32_t qh; uint64_t tmp[4]; + // TODO: check if unrolling this is better for (int i = 0; i < nb; ++i) { const block_q5_1 * restrict x0 = &x[i]; const block_q8_1 * restrict y0 = &y[i]; From ca7f069f39251a0289c99d0d55c373b2e181a381 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 May 2023 23:33:07 +0300 Subject: [PATCH 30/32] ggml : back to original bit order --- SHA256SUMS | 16 +++---- ggml-cuda.cu | 21 ++++----- ggml-opencl.c | 1 
- ggml.c | 125 +++++++++++++++++++++----------------------------- llama.cpp | 4 +- 5 files changed, 73 insertions(+), 94 deletions(-) diff --git a/SHA256SUMS b/SHA256SUMS index c3f935a85b135..593c8efaa2bb7 100644 --- a/SHA256SUMS +++ b/SHA256SUMS @@ -1,17 +1,17 @@ 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth 666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin -b734d7201dc7869855fe2861247178719607d96372f0fb1bf6a1c5810898a48f models/7B/ggml-model-q4_0.bin -1ea1d3e94d0012ee5c23ee5ee2c8909eb124a1e8e43c11108feb17879d8b9379 models/7B/ggml-model-q4_1.bin -3232f282b40e3330093acb96e7d4983ce15b80a7e38b49d035e83b9aab753671 models/7B/ggml-model-q5_0.bin -75b1e0ef9a7ba27d760e4239422e29a6ced0ff9c4f2537f1cc4754821bdb8d3e models/7B/ggml-model-q5_1.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_0.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_1.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_0.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_1.bin 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth 2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin -a8dd1a853a3227abda5b2046dcc23b1f06ee8b837bc97b34f6b182229eca21ff models/13B/ggml-model-q4_0.bin -3a58a576f0e188ad77bc5104407f1c7cf129928d1af2f920099fa206ca6af34a models/13B/ggml-model-q4_1.bin -814f9e369ca0daf4517b6a66bdf8d616c5d4ae8b4353fe091d15080e66965c34 models/13B/ggml-model-q5_0.bin -74ab4eacb6ef14e08c7f06a2dd0b2630c3f920149324acf6651222ed397c430f models/13B/ggml-model-q5_1.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_0.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_1.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_0.bin +ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_1.bin 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 08d1566bdd880..8a3beb0e54b88 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -86,8 +86,8 @@ static __global__ void dequantize_block_q4_0(const void * vx, float * y) { const int x0 = (x[i].qs[j] & 0xf) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; - y[i*qk + 2*j + 0] = x0*d; - y[i*qk + 2*j + 1] = x1*d; + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; } } @@ -105,8 +105,8 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) { const int x0 = (x[i].qs[j] & 0xf); const int x1 = (x[i].qs[j] >> 4); - y[i*qk + 2*j + 0] = x0*d + m; - y[i*qk + 2*j + 1] = x1*d + m; + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; } } @@ -129,8 +129,8 @@ static __global__ void dequantize_block_q5_0(const void * vx, float * y) { const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; 
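/* High-bit extraction, spelled out (illustrative): bit j of qh carries the
   5th bit of the low-nibble quant and bit j+16 that of the high-nibble
   quant, i.e. the masked forms above compute

       xh_0 = ((qh >> (j + 0 )) & 1) << 4;
       xh_1 = ((qh >> (j + 16)) & 1) << 4;   // shifting by j+12 already
                                             // lands the bit at position 4
*/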
- y[i*qk + 2*j + 0] = x0*d; - y[i*qk + 2*j + 1] = x1*d; + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; } } @@ -154,8 +154,8 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) { const int x0 = (x[i].qs[j] & 0xf) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; - y[i*qk + 2*j + 0] = x0*d + m; - y[i*qk + 2*j + 1] = x1*d + m; + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; } } @@ -168,9 +168,8 @@ static __global__ void dequantize_block_q8_0(const void * vx, float * y) { const float d = x[i].d; - for (int j = 0; j < qk/2; ++j) { - y[i*qk + 2*j + 0] = x[i].qs[j + 0 ]*d; - y[i*qk + 2*j + 1] = x[i].qs[j + qk/2]*d; + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = x[i].qs[j]*d; } } diff --git a/ggml-opencl.c b/ggml-opencl.c index 230c84f2fb411..0e6e6770f6307 100644 --- a/ggml-opencl.c +++ b/ggml-opencl.c @@ -114,7 +114,6 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f const uint i = get_global_id(0) / 32; const uint l = get_local_id(0); - // TODO: this is broken result[i*32 + l] = blocks[i].qs[l] * blocks[i].d; } diff --git a/ggml.c b/ggml.c index a9c10a295cdea..096ccacfb7e08 100644 --- a/ggml.c +++ b/ggml.c @@ -525,30 +525,14 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) { return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); } -static inline __m256i bytes_from_nibbles_32_deinterleave(const uint8_t * rsi) { - const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); - const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp); - const __m256i lowMask = _mm256_set1_epi8( 0xF ); - return _mm256_and_si256(lowMask, bytes); -} - // Unpack 32 4-bit fields into 32 bytes // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { - // Load 16 bytes from memory - __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi ); - - // Expand bytes into uint16_t values - __m256i bytes = _mm256_cvtepu8_epi16( tmp ); - - // Unpack values into individual bytes + const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); + const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp); const __m256i lowMask = _mm256_set1_epi8( 0xF ); - __m256i high = _mm256_andnot_si256( lowMask, bytes ); - __m256i low = _mm256_and_si256( lowMask, bytes ); - high = _mm256_slli_epi16( high, 4 ); - bytes = _mm256_or_si256( low, high ); - return bytes; + return _mm256_and_si256(lowMask, bytes); } // add int16_t pairwise and return as float vector @@ -766,8 +750,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r y[i].d = d; for (int j = 0; j < qk/2; ++j) { - const float x0 = x[i*qk + 2*j + 0]*id; - const float x1 = x[i*qk + 2*j + 1]*id; + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); @@ -807,8 +791,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r y[i].m = min; for (int j = 0; j < qk/2; ++j) { - const float x0 = (x[i*qk + 2*j + 0] - min)*id; - const float x1 = (x[i*qk + 2*j + 1] - min)*id; + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); @@ -850,8 +834,8 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r uint32_t qh = 0; for (int j = 0; j < qk/2; ++j) { - const float 
x0 = x[i*qk + 2*j + 0]*id; - const float x1 = x[i*qk + 2*j + 1]*id; + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); @@ -898,8 +882,8 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r uint32_t qh = 0; for (int j = 0; j < qk/2; ++j) { - const float x0 = (x[i*qk + 2*j + 0] - min)*id; - const float x1 = (x[i*qk + 2*j + 1] - min)*id; + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; const uint8_t xi0 = (uint8_t)(x0 + 0.5f); const uint8_t xi1 = (uint8_t)(x1 + 0.5f); @@ -937,12 +921,10 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r y[i].d = d; - for (int j = 0; j < QK8_0/2; ++j) { - const float v0 = x[i*QK8_0 + 2*j + 0]*id; - const float v1 = x[i*QK8_0 + 2*j + 1]*id; + for (int j = 0; j < QK8_0; ++j) { + const float x0 = x[i*QK8_0 + j]*id; - y[i].qs[ j] = roundf(v0); - y[i].qs[QK8_0/2 + j] = roundf(v1); + y[i].qs[j] = roundf(x0); } } } @@ -978,13 +960,13 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int const float32x4_t v = vmulq_n_f32(srcv[j], id); const int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[ 2*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1); - y[i].qs[ 2*j + 1] = vgetq_lane_s32(vi, 2); - y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3); + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); } } -#elif defined(__AVX2__) +#elif defined(__AVX2__) || defined(__AVX__) for (int i = 0; i < nb; i++) { // Load elements into 4 AVX vectors __m256 v0 = _mm256_loadu_ps( x ); @@ -1029,7 +1011,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int __m256i i2 = _mm256_cvtps_epi32( v2 ); __m256i i3 = _mm256_cvtps_epi32( v3 ); -#if defined(__AVX2__) // || defined(__AVX__) TODO +#if defined(__AVX2__) // Convert int32 to int16 i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 @@ -1037,11 +1019,10 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 // We got our precious signed bytes, but the order is now wrong - // TODO: find a smarter way to do this - i2 = _mm256_permute2f128_si256(i0, i0, 0x01); - i1 = _mm256_shuffle_epi8(i0, _mm256_setr_epi8( 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14,-1,-1,-1,-1, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15)); - i2 = _mm256_shuffle_epi8(i2, _mm256_setr_epi8(-1,-1, 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15,-1,-1)); - i0 = _mm256_or_si256(i1, i2); + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); _mm256_storeu_si256((__m256i *)y[i].qs, i0); #else @@ -1097,8 +1078,8 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r int sum = 0; for (int j = 0; j < QK8_1/2; ++j) { - const float v0 = x[i*QK8_1 + 2*j + 0]*id; - const float v1 = x[i*QK8_1 + 2*j + 
1]*id; + const float v0 = x[i*QK8_1 + j]*id; + const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id; y[i].qs[ j] = roundf(v0); y[i].qs[QK8_1/2 + j] = roundf(v1); @@ -1143,17 +1124,17 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int const float32x4_t v = vmulq_n_f32(srcv[j], id); const int32x4_t vi = vcvtnq_s32_f32(v); - y[i].qs[ 2*j + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1); - y[i].qs[ 2*j + 1] = vgetq_lane_s32(vi, 2); - y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3); + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); accv = vaddq_s32(accv, vi); } y[i].s = d * vaddvq_s32(accv); } -#elif defined(__AVX2__) +#elif defined(__AVX2__) || defined(__AVX__) for (int i = 0; i < nb; i++) { // Load elements into 4 AVX vectors __m256 v0 = _mm256_loadu_ps( x ); @@ -1198,7 +1179,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int __m256i i2 = _mm256_cvtps_epi32( v2 ); __m256i i3 = _mm256_cvtps_epi32( v3 ); -#if defined(__AVX2__) // || defined(__AVX__) TODO +#if defined(__AVX2__) // Compute the sum of the quants and set y[i].s y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); @@ -1209,11 +1190,10 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 // We got our precious signed bytes, but the order is now wrong - // TODO: find a smarter way to do this - i2 = _mm256_permute2f128_si256(i0, i0, 0x01); - i1 = _mm256_shuffle_epi8(i0, _mm256_setr_epi8( 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14,-1,-1,-1,-1, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15)); - i2 = _mm256_shuffle_epi8(i2, _mm256_setr_epi8(-1,-1, 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15,-1,-1)); - i0 = _mm256_or_si256(i1, i2); + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); _mm256_storeu_si256((__m256i *)y[i].qs, i0); #else @@ -1266,8 +1246,8 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict const int x0 = (x[i].qs[j] & 0x0F) - 8; const int x1 = (x[i].qs[j] >> 4) - 8; - y[i*qk + 2*j + 0] = x0*d; - y[i*qk + 2*j + 1] = x1*d; + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; } } } @@ -1287,8 +1267,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict const int x0 = (x[i].qs[j] & 0x0F); const int x1 = (x[i].qs[j] >> 4); - y[i*qk + 2*j + 0] = x0*d + m; - y[i*qk + 2*j + 1] = x1*d + m; + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; } } } @@ -1313,8 +1293,8 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; - y[i*qk + 2*j + 0] = x0*d; - y[i*qk + 2*j + 1] = x1*d; + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; } } } @@ -1340,8 +1320,8 @@ static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict const int x0 = (x[i].qs[j] & 0x0F) | xh_0; const int x1 = (x[i].qs[j] >> 4) | xh_1; - y[i*qk + 2*j + 0] = x0*d + m; - y[i*qk + 2*j + 1] = x1*d + m; + 
y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; } } } @@ -1358,9 +1338,8 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in for (int i = 0; i < nb; i++) { const float d = x[i].d; - for (int j = 0; j < qk/2; ++j) { - y[i*qk + 2*j + 0] = x[i].qs[j + 0 ]*d; - y[i*qk + 2*j + 1] = x[i].qs[j + qk/2]*d; + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = x[i].qs[j]*d; } } } @@ -2103,7 +2082,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * /* Compute combined scale for the block */ const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); - __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs); + __m256i bx = bytes_from_nibbles_32(x[i].qs); // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. const __m256i off = _mm256_set1_epi8( 8 ); @@ -2262,7 +2241,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes - const __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs); + const __m256i bx = bytes_from_nibbles_32(x[i].qs); const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); const __m256 xy = mul_sum_i8_pairs_float(bx, by); @@ -2466,7 +2445,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * /* Compute combined scale for the block */ const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d)); - __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs); + __m256i bx = bytes_from_nibbles_32(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); bx = _mm256_or_si256(bx, bxhi); @@ -2694,7 +2673,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; - __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs); + __m256i bx = bytes_from_nibbles_32(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); bx = _mm256_or_si256(bx, bxhi); @@ -2719,8 +2698,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * int sumi = 0; for (int j = 0; j < qk/2; ++j) { - const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; - const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; diff --git a/llama.cpp b/llama.cpp index b2dbc6c3b0862..b27eb91e4f258 100644 --- a/llama.cpp +++ b/llama.cpp @@ -919,7 +919,9 @@ static void llama_model_load_internal( } if (file_version != LLAMA_FILE_VERSION_GGJT_V2) { - if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) { + if (hparams.ftype != LLAMA_FTYPE_ALL_F32 && + hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 && + hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) { throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)"); } } From b58b1f4bf6d5f7a4086d39fe269fbc06857cbf4f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 12 May 2023 00:00:40 +0300 Subject: [PATCH 31/32] readme : add note that Q4 and Q5 have been changed --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 
396e3be657b54..8bc051c6b91f1 100644
--- a/README.md
+++ b/README.md
@@ -9,8 +9,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 **Hot topics:**
 
+- Quantization formats `Q4` and `Q5` have changed - requantize any old models [(info)](https://github.com/ggerganov/llama.cpp/pull/1405)
 - [Roadmap May 2023](https://github.com/ggerganov/llama.cpp/discussions/1220)
-- [New quantization methods](https://github.com/ggerganov/llama.cpp#quantization)
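The README note above is the user-facing half of the change; on the implementation side, the AVX2 quantization paths in ggml.c now fix up the byte order with a single cross-lane permute instead of the old pair of shuffles plus OR. _mm256_packs_epi32 and _mm256_packs_epi16 each operate on the two 128-bit lanes independently, so after both packs the 32 bytes land in eight 4-byte groups ordered 0-3, 8-11, 16-19, 24-27, 4-7, 12-15, 20-23, 28-31; one _mm256_permutevar8x32_epi32 with the pattern (0, 4, 1, 5, 2, 6, 3, 7) restores ascending order. A scalar model of that permute, offered as an illustration only (not code from the patch):

    #include <stdio.h>

    int main(void) {
        /* After the two in-lane packs, dword g of the vector holds source
           elements 4*src[g] .. 4*src[g]+3. */
        const int src[8]  = {0, 2, 4, 6, 1, 3, 5, 7};
        /* The pattern passed to _mm256_permutevar8x32_epi32 in the patch. */
        const int perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
        for (int i = 0; i < 8; ++i) {
            printf("dst dword %d <- elements %2d..%2d\n",
                   i, 4*src[perm[i]], 4*src[perm[i]] + 3);
        }
        return 0; /* prints elements 0..31 in ascending order */
    }

Because the permute crosses lanes in a single instruction, this also retires the old "find a smarter way to do this" TODO.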
Table of Contents From cbb6a3a7e8e70ced0371ae3b41f4cd2d4dd97333 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 12 May 2023 00:08:36 +0300 Subject: [PATCH 32/32] llama : fix return for unknown version --- llama.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama.cpp b/llama.cpp index b27eb91e4f258..0a47faa9d738d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -843,6 +843,8 @@ static const char *llama_file_version_name(llama_file_version version) { case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)"; case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)"; } + + return "unknown"; } static const char *llama_ftype_name(enum llama_ftype ftype) {
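A final detail worth pinning down from the main patch: the scalar Q5_1 dot product now recovers the fifth bit of each quant by shifting qh before masking with 0x10, rather than isolating bit j (or j + 16) first and then shifting it into place. Both forms reduce to "bit j (respectively j + 16) of qh, placed at bit position 4". A small self-contained check, as an illustration only:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const uint32_t qh = 0xA5C3F10Eu; /* arbitrary bit pattern */
        for (int j = 0; j < 16; ++j) {
            /* old forms */
            const uint8_t old0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
            const uint8_t old1 =  (qh & (1u << (j + 16))) >> (j + 12);
            /* new forms */
            const uint8_t new0 = ((qh >> (j + 0 )) << 4) & 0x10;
            const uint8_t new1 =  (qh >> (j + 12))       & 0x10;
            assert(old0 == new0);
            assert(old1 == new1);
        }
        printf("old and new Q5_1 high-bit extraction agree\n");
        return 0;
    }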