diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec.h b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec.h index bd85eda79..220de4cef 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec.h +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec.h @@ -33,27 +33,40 @@ using SQuantizer = ScalarQuantizer::SQuantizer; */ struct Codec8bit { - static void encode_component(float x, uint8_t* code, int i) { + static FAISS_ALWAYS_INLINE void encode_component( + float x, + uint8_t* code, + int i) { code[i] = (int)(255 * x); } - static float decode_component(const uint8_t* code, int i) { + static FAISS_ALWAYS_INLINE float decode_component( + const uint8_t* code, + int i) { return (code[i] + 0.5f) / 255.0f; } }; struct Codec4bit { - static void encode_component(float x, uint8_t* code, int i) { + static FAISS_ALWAYS_INLINE void encode_component( + float x, + uint8_t* code, + int i) { code[i / 2] |= (int)(x * 15.0) << ((i & 1) << 2); } - static float decode_component(const uint8_t* code, int i) { + static FAISS_ALWAYS_INLINE float decode_component( + const uint8_t* code, + int i) { return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f; } }; struct Codec6bit { - static void encode_component(float x, uint8_t* code, int i) { + static FAISS_ALWAYS_INLINE void encode_component( + float x, + uint8_t* code, + int i) { int bits = (int)(x * 63.0); code += (i >> 2) * 3; switch (i & 3) { @@ -74,7 +87,9 @@ struct Codec6bit { } } - static float decode_component(const uint8_t* code, int i) { + static FAISS_ALWAYS_INLINE float decode_component( + const uint8_t* code, + int i) { uint8_t bits = 0x00; code += (i >> 2) * 3; switch (i & 3) { @@ -136,7 +151,8 @@ struct QuantizerTemplate : SQuantizer { } } - float reconstruct_component(const uint8_t* code, int i) const { + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) + const { float xi = Codec::decode_component(code, i); return vmin + xi * vdiff; } @@ -173,7 +189,8 @@ struct QuantizerTemplate : SQuantizer { } } - float reconstruct_component(const uint8_t* code, int i) const { + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) + const { float xi = Codec::decode_component(code, i); return vmin[i] + xi * vdiff[i]; } @@ -204,7 +221,8 @@ struct QuantizerFP16<1> : SQuantizer { } } - float reconstruct_component(const uint8_t* code, int i) const { + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) + const { return decode_fp16(((uint16_t*)code)[i]); } }; @@ -235,7 +253,8 @@ struct Quantizer8bitDirect<1> : SQuantizer { } } - float reconstruct_component(const uint8_t* code, int i) const { + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) + const { return code[i]; } }; @@ -291,22 +310,22 @@ struct SimilarityL2<1> { float accu; - void begin() { + FAISS_ALWAYS_INLINE void begin() { accu = 0; yi = y; } - void add_component(float x) { + FAISS_ALWAYS_INLINE void add_component(float x) { float tmp = *yi++ - x; accu += tmp * tmp; } - void add_component_2(float x1, float x2) { + FAISS_ALWAYS_INLINE void add_component_2(float x1, float x2) { float tmp = x1 - x2; accu += tmp * tmp; } - float result() { + FAISS_ALWAYS_INLINE float result() { return accu; } }; @@ -324,20 +343,20 @@ struct SimilarityIP<1> { explicit SimilarityIP(const float* y) : y(y) {} - void begin() { + FAISS_ALWAYS_INLINE void begin() { accu = 0; yi = y; } - void add_component(float x) { + FAISS_ALWAYS_INLINE void add_component(float x) { accu += *yi++ * x; } - void add_component_2(float x1, float x2) { + FAISS_ALWAYS_INLINE void add_component_2(float x1, float x2) { accu += x1 * x2; } - float result() { + FAISS_ALWAYS_INLINE float result() { return accu; } }; diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx.h b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx.h index fef86c9be..ba829371a 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx.h +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx.h @@ -7,14 +7,14 @@ #pragma once -#include -#include -#include #include +#include +#include +#include -#include #include #include +#include namespace faiss { @@ -30,7 +30,8 @@ using SQuantizer = ScalarQuantizer::SQuantizer; */ struct Codec8bit_avx : public Codec8bit { - static __m256 decode_8_components(const uint8_t* code, int i) { + static FAISS_ALWAYS_INLINE __m256 + decode_8_components(const uint8_t* code, int i) { const uint64_t c8 = *(uint64_t*)(code + i); const __m128i i8 = _mm_set1_epi64x(c8); @@ -43,7 +44,8 @@ struct Codec8bit_avx : public Codec8bit { }; struct Codec4bit_avx : public Codec4bit { - static __m256 decode_8_components(const uint8_t* code, int i) { + static FAISS_ALWAYS_INLINE __m256 + decode_8_components(const uint8_t* code, int i) { uint32_t c4 = *(uint32_t*)(code + (i >> 1)); uint32_t mask = 0x0f0f0f0f; uint32_t c4ev = c4 & mask; @@ -67,7 +69,7 @@ struct Codec4bit_avx : public Codec4bit { struct Codec6bit_avx : public Codec6bit { /* Load 6 bytes that represent 8 6-bit values, return them as a * 8*32 bit vector register */ - static __m256i load6(const uint16_t* code16) { + static FAISS_ALWAYS_INLINE __m256i load6(const uint16_t* code16) { const __m128i perm = _mm_set_epi8( -1, 5, 5, 4, 4, 3, -1, 3, -1, 2, 2, 1, 1, 0, -1, 0); const __m256i shifts = _mm256_set_epi32(2, 4, 6, 0, 2, 4, 6, 0); @@ -86,7 +88,10 @@ struct Codec6bit_avx : public Codec6bit { return c5; } - static __m256 decode_8_components(const uint8_t* code, int i) { + static FAISS_ALWAYS_INLINE __m256 + decode_8_components(const uint8_t* code, int i) { + // // Faster code for Intel CPUs or AMD Zen3+, just keeping it here + // // for the reference, maybe, it becomes used oned day. // const uint16_t* data16 = (const uint16_t*)(code + (i >> 2) * 3); // const uint32_t* data32 = (const uint32_t*)data16; // const uint64_t val = *data32 + ((uint64_t)data16[2] << 32); @@ -97,15 +102,14 @@ struct Codec6bit_avx : public Codec6bit { // const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f); // const __m256 one_255 = _mm256_set1_ps(1.f / 63.f); // return _mm256_fmadd_ps(f8, one_255, half_one_255); - + __m256i i8 = load6((const uint16_t*)(code + (i >> 2) * 3)); __m256 f8 = _mm256_cvtepi32_ps(i8); // this could also be done with bit manipulations but it is // not obviously faster - __m256 half = _mm256_set1_ps(0.5f); - f8 = _mm256_add_ps(f8, half); - __m256 one_63 = _mm256_set1_ps(1.f / 63.f); - return _mm256_mul_ps(f8, one_63); + const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f); + const __m256 one_255 = _mm256_set1_ps(1.f / 63.f); + return _mm256_fmadd_ps(f8, one_255, half_one_255); } }; @@ -130,11 +134,11 @@ struct QuantizerTemplate_avx QuantizerTemplate_avx(size_t d, const std::vector& trained) : QuantizerTemplate(d, trained) {} - __m256 reconstruct_8_components(const uint8_t* code, int i) const { + FAISS_ALWAYS_INLINE __m256 + reconstruct_8_components(const uint8_t* code, int i) const { __m256 xi = Codec::decode_8_components(code, i); - return _mm256_add_ps( - _mm256_set1_ps(this->vmin), - _mm256_mul_ps(xi, _mm256_set1_ps(this->vdiff))); + return _mm256_fmadd_ps( + xi, _mm256_set1_ps(this->vdiff), _mm256_set1_ps(this->vmin)); } }; @@ -151,11 +155,13 @@ struct QuantizerTemplate_avx QuantizerTemplate_avx(size_t d, const std::vector& trained) : QuantizerTemplate(d, trained) {} - __m256 reconstruct_8_components(const uint8_t* code, int i) const { + FAISS_ALWAYS_INLINE __m256 + reconstruct_8_components(const uint8_t* code, int i) const { __m256 xi = Codec::decode_8_components(code, i); - return _mm256_add_ps( - _mm256_loadu_ps(this->vmin + i), - _mm256_mul_ps(xi, _mm256_loadu_ps(this->vdiff + i))); + return _mm256_fmadd_ps( + xi, + _mm256_loadu_ps(this->vdiff + i), + _mm256_loadu_ps(this->vmin + i)); } }; @@ -177,7 +183,8 @@ struct QuantizerFP16_avx<8> : public QuantizerFP16<1> { QuantizerFP16_avx(size_t d, const std::vector& trained) : QuantizerFP16<1>(d, trained) {} - __m256 reconstruct_8_components(const uint8_t* code, int i) const { + FAISS_ALWAYS_INLINE __m256 + reconstruct_8_components(const uint8_t* code, int i) const { __m128i codei = _mm_loadu_si128((const __m128i*)(code + 2 * i)); return _mm256_cvtph_ps(codei); } @@ -201,10 +208,11 @@ struct Quantizer8bitDirect_avx<8> : public Quantizer8bitDirect<1> { Quantizer8bitDirect_avx(size_t d, const std::vector& trained) : Quantizer8bitDirect<1>(d, trained) {} - __m256 reconstruct_8_components(const uint8_t* code, int i) const { + FAISS_ALWAYS_INLINE __m256 + reconstruct_8_components(const uint8_t* code, int i) const { __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 - __m256i y8 = _mm256_cvtepu8_epi32(x8); // 8 * int32 - return _mm256_cvtepi32_ps(y8); // 8 * float32 + __m256i y8 = _mm256_cvtepu8_epi32(x8); // 8 * int32 + return _mm256_cvtepi32_ps(y8); // 8 * float32 } }; @@ -254,7 +262,7 @@ struct SimilarityL2_avx<1> : public SimilarityL2<1> { explicit SimilarityL2_avx(const float* y) : SimilarityL2<1>(y) {} }; -template<> +template <> struct SimilarityL2_avx<8> { static constexpr int simdwidth = 8; static constexpr MetricType metric_type = METRIC_L2; @@ -264,29 +272,31 @@ struct SimilarityL2_avx<8> { explicit SimilarityL2_avx(const float* y) : y(y) {} __m256 accu8; - void begin_8() { + FAISS_ALWAYS_INLINE void begin_8() { accu8 = _mm256_setzero_ps(); yi = y; } - void add_8_components(__m256 x) { + FAISS_ALWAYS_INLINE void add_8_components(__m256 x) { __m256 yiv = _mm256_loadu_ps(yi); yi += 8; __m256 tmp = _mm256_sub_ps(yiv, x); - accu8 = _mm256_add_ps(accu8, _mm256_mul_ps(tmp, tmp)); + accu8 = _mm256_fmadd_ps(tmp, tmp, accu8); } - void add_8_components_2(__m256 x, __m256 y) { + FAISS_ALWAYS_INLINE void add_8_components_2(__m256 x, __m256 y) { __m256 tmp = _mm256_sub_ps(y, x); - accu8 = _mm256_add_ps(accu8, _mm256_mul_ps(tmp, tmp)); + accu8 = _mm256_fmadd_ps(tmp, tmp, accu8); } - float result_8() { - __m256 sum = _mm256_hadd_ps(accu8, accu8); - __m256 sum2 = _mm256_hadd_ps(sum, sum); - // now add the 0th and 4th component - return _mm_cvtss_f32(_mm256_castps256_ps128(sum2)) + - _mm_cvtss_f32(_mm256_extractf128_ps(sum2, 1)); + FAISS_ALWAYS_INLINE float result_8() { + const __m128 sum = _mm_add_ps( + _mm256_castps256_ps128(accu8), _mm256_extractf128_ps(accu8, 1)); + const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)); + const __m128 v1 = _mm_add_ps(sum, v0); + __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); + const __m128 v3 = _mm_add_ps(v1, v2); + return _mm_cvtss_f32(v3); } }; @@ -314,27 +324,29 @@ struct SimilarityIP_avx<8> { __m256 accu8; - void begin_8() { + FAISS_ALWAYS_INLINE void begin_8() { accu8 = _mm256_setzero_ps(); yi = y; } - void add_8_components(__m256 x) { + FAISS_ALWAYS_INLINE void add_8_components(__m256 x) { __m256 yiv = _mm256_loadu_ps(yi); yi += 8; - accu8 = _mm256_add_ps(accu8, _mm256_mul_ps(yiv, x)); + accu8 = _mm256_fmadd_ps(yiv, x, accu8); } - void add_8_components_2(__m256 x1, __m256 x2) { - accu8 = _mm256_add_ps(accu8, _mm256_mul_ps(x1, x2)); + FAISS_ALWAYS_INLINE void add_8_components_2(__m256 x1, __m256 x2) { + accu8 = _mm256_fmadd_ps(x1, x2, accu8); } - float result_8() { - __m256 sum = _mm256_hadd_ps(accu8, accu8); - __m256 sum2 = _mm256_hadd_ps(sum, sum); - // now add the 0th and 4th component - return _mm_cvtss_f32(_mm256_castps256_ps128(sum2)) + - _mm_cvtss_f32(_mm256_extractf128_ps(sum2, 1)); + FAISS_ALWAYS_INLINE float result_8() { + const __m128 sum = _mm_add_ps( + _mm256_castps256_ps128(accu8), _mm256_extractf128_ps(accu8, 1)); + const __m128 v0 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)); + const __m128 v1 = _mm_add_ps(sum, v0); + __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); + const __m128 v3 = _mm_add_ps(v1, v2); + return _mm_cvtss_f32(v3); } }; @@ -349,7 +361,7 @@ struct DCTemplate_avx : SQDistanceComputer {}; template struct DCTemplate_avx : public DCTemplate { - DCTemplate_avx(size_t d, const std::vector &trained) + DCTemplate_avx(size_t d, const std::vector& trained) : DCTemplate(d, trained) {} }; @@ -404,16 +416,14 @@ struct DCTemplate_avx : SQDistanceComputer { } void query_to_codes_batch_4( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3 - ) const override final { - + const uint8_t* __restrict code_0, + const uint8_t* __restrict code_1, + const uint8_t* __restrict code_2, + const uint8_t* __restrict code_3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) const override final { Similarity sim0(q); Similarity sim1(q); Similarity sim2(q); @@ -454,7 +464,7 @@ struct DistanceComputerByte_avx : SQDistanceComputer {}; template struct DistanceComputerByte_avx : public DistanceComputerByte { - DistanceComputerByte_avx(int d, const std::vector &unused) + DistanceComputerByte_avx(int d, const std::vector& unused) : DistanceComputerByte(d, unused) {} }; @@ -529,8 +539,8 @@ struct DistanceComputerByte_avx : SQDistanceComputer { * specialization *******************************************************************/ -template -SQDistanceComputer *select_distance_computer_avx( +template +SQDistanceComputer* select_distance_computer_avx( QuantizerType qtype, size_t d, const std::vector& trained) { @@ -593,10 +603,11 @@ InvertedListScanner* sel2_InvertedListScanner_avx( bool store_pairs, const IDSelector* sel, bool r) { - return sel2_InvertedListScanner(sq, quantizer, store_pairs, sel, r); + return sel2_InvertedListScanner( + sq, quantizer, store_pairs, sel, r); } -template +template InvertedListScanner* sel12_InvertedListScanner_avx( const ScalarQuantizer* sq, const Index* quantizer, @@ -606,10 +617,11 @@ InvertedListScanner* sel12_InvertedListScanner_avx( constexpr int SIMDWIDTH = Similarity::simdwidth; using QuantizerClass = QuantizerTemplate_avx; using DCClass = DCTemplate_avx; - return sel2_InvertedListScanner_avx(sq, quantizer, store_pairs, sel, r); + return sel2_InvertedListScanner_avx( + sq, quantizer, store_pairs, sel, r); } -template +template InvertedListScanner* sel1_InvertedListScanner_avx( const ScalarQuantizer* sq, const Index* quantizer, @@ -619,20 +631,30 @@ InvertedListScanner* sel1_InvertedListScanner_avx( constexpr int SIMDWIDTH = Similarity::simdwidth; switch (sq->qtype) { case QuantizerType::QT_8bit_uniform: - return sel12_InvertedListScanner_avx( - sq, quantizer, store_pairs, sel, r); + return sel12_InvertedListScanner_avx< + Similarity, + Codec8bit_avx, + true>(sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_4bit_uniform: - return sel12_InvertedListScanner_avx( - sq, quantizer, store_pairs, sel, r); + return sel12_InvertedListScanner_avx< + Similarity, + Codec4bit_avx, + true>(sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_8bit: - return sel12_InvertedListScanner_avx( - sq, quantizer, store_pairs, sel, r); + return sel12_InvertedListScanner_avx< + Similarity, + Codec8bit_avx, + false>(sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_4bit: - return sel12_InvertedListScanner_avx( - sq, quantizer, store_pairs, sel, r); + return sel12_InvertedListScanner_avx< + Similarity, + Codec4bit_avx, + false>(sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_6bit: - return sel12_InvertedListScanner_avx( - sq, quantizer, store_pairs, sel, r); + return sel12_InvertedListScanner_avx< + Similarity, + Codec6bit_avx, + false>(sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_fp16: return sel2_InvertedListScanner_avx, @@ -655,7 +677,7 @@ InvertedListScanner* sel1_InvertedListScanner_avx( return nullptr; } -template +template InvertedListScanner* sel0_InvertedListScanner_avx( MetricType mt, const ScalarQuantizer* sq, diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx512.h b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx512.h index ab0365a57..87ffb08aa 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx512.h +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx512.h @@ -7,14 +7,14 @@ #pragma once -#include -#include -#include #include +#include +#include +#include -#include #include #include +#include namespace faiss { @@ -30,7 +30,8 @@ using SQuantizer = ScalarQuantizer::SQuantizer; */ struct Codec8bit_avx512 : public Codec8bit_avx { - static __m512 decode_16_components(const uint8_t *code, int i) { + static FAISS_ALWAYS_INLINE __m512 + decode_16_components(const uint8_t* code, int i) { const __m128i c8 = _mm_loadu_si128((const __m128i_u*)(code + i)); const __m512i i32 = _mm512_cvtepu8_epi32(c8); const __m512 f8 = _mm512_cvtepi32_ps(i32); @@ -41,30 +42,31 @@ struct Codec8bit_avx512 : public Codec8bit_avx { }; struct Codec4bit_avx512 : public Codec4bit_avx { - static __m512 decode_16_components(const uint8_t *code, int i) { + static FAISS_ALWAYS_INLINE __m512 + decode_16_components(const uint8_t* code, int i) { uint64_t c8 = *(uint64_t*)(code + (i >> 1)); uint64_t mask = 0x0f0f0f0f0f0f0f0f; uint64_t c8ev = c8 & mask; uint64_t c8od = (c8 >> 4) & mask; // the 8 lower bytes of c8 contain the values - __m128i c16 = _mm_unpacklo_epi8( - _mm_set1_epi64x(c8ev), _mm_set1_epi64x(c8od)); + __m128i c16 = + _mm_unpacklo_epi8(_mm_set1_epi64x(c8ev), _mm_set1_epi64x(c8od)); __m256i c8lo = _mm256_cvtepu8_epi32(c16); __m256i c8hi = _mm256_cvtepu8_epi32(_mm_srli_si128(c16, 4)); __m512i i16 = _mm512_castsi256_si512(c8lo); i16 = _mm512_inserti32x8(i16, c8hi, 1); __m512 f16 = _mm512_cvtepi32_ps(i16); - __m512 half = _mm512_set1_ps(0.5f); - f16 = _mm512_add_ps(f16, half); - __m512 one_255 = _mm512_set1_ps(1.f / 15.f); - return _mm512_mul_ps(f16, one_255); + const __m512 half_one_255 = _mm512_set1_ps(0.5f / 15.f); + const __m512 one_255 = _mm512_set1_ps(1.f / 15.f); + return _mm512_fmadd_ps(f16, one_255, half_one_255); } }; struct Codec6bit_avx512 : public Codec6bit_avx { // TODO: can be optimized - static __m512 decode_16_components(const uint8_t* code, int i) { + static FAISS_ALWAYS_INLINE __m512 + decode_16_components(const uint8_t* code, int i) { // // todo aguzhva: the following piece of code is very fast // // for Intel chips. AMD ones will be very slow unless Zen3+ // @@ -85,23 +87,23 @@ struct Codec6bit_avx512 : public Codec6bit_avx { // const __m512 one_255 = _mm512_set1_ps(1.f / 63.f); // return _mm512_fmadd_ps(f8, one_255, half_one_255); - return _mm512_set_ps - (decode_component(code, i + 15), - decode_component(code, i + 14), - decode_component(code, i + 13), - decode_component(code, i + 12), - decode_component(code, i + 11), - decode_component(code, i + 10), - decode_component(code, i + 9), - decode_component(code, i + 8), - decode_component(code, i + 7), - decode_component(code, i + 6), - decode_component(code, i + 5), - decode_component(code, i + 4), - decode_component(code, i + 3), - decode_component(code, i + 2), - decode_component(code, i + 1), - decode_component(code, i + 0)); + return _mm512_set_ps( + decode_component(code, i + 15), + decode_component(code, i + 14), + decode_component(code, i + 13), + decode_component(code, i + 12), + decode_component(code, i + 11), + decode_component(code, i + 10), + decode_component(code, i + 9), + decode_component(code, i + 8), + decode_component(code, i + 7), + decode_component(code, i + 6), + decode_component(code, i + 5), + decode_component(code, i + 4), + decode_component(code, i + 3), + decode_component(code, i + 2), + decode_component(code, i + 1), + decode_component(code, i + 0)); } }; @@ -133,11 +135,11 @@ struct QuantizerTemplate_avx512 QuantizerTemplate_avx512(size_t d, const std::vector& trained) : QuantizerTemplate_avx(d, trained) {} - __m512 reconstruct_16_components(const uint8_t* code, int i) const { + FAISS_ALWAYS_INLINE __m512 + reconstruct_16_components(const uint8_t* code, int i) const { __m512 xi = Codec::decode_16_components(code, i); - return _mm512_add_ps( - _mm512_set1_ps(this->vmin), - _mm512_mul_ps(xi, _mm512_set1_ps(this->vdiff))); + return _mm512_fmadd_ps( + xi, _mm512_set1_ps(this->vdiff), _mm512_set1_ps(this->vmin)); } }; @@ -151,7 +153,7 @@ struct QuantizerTemplate_avx512 template struct QuantizerTemplate_avx512 : public QuantizerTemplate_avx { - QuantizerTemplate_avx512(size_t d, const std::vector&trained) + QuantizerTemplate_avx512(size_t d, const std::vector& trained) : QuantizerTemplate_avx(d, trained) {} }; @@ -161,11 +163,13 @@ struct QuantizerTemplate_avx512 QuantizerTemplate_avx512(size_t d, const std::vector& trained) : QuantizerTemplate_avx(d, trained) {} - __m512 reconstruct_16_components(const uint8_t* code, int i) const { + FAISS_ALWAYS_INLINE __m512 + reconstruct_16_components(const uint8_t* code, int i) const { __m512 xi = Codec::decode_16_components(code, i); - return _mm512_add_ps( - _mm512_loadu_ps(this->vmin + i), - _mm512_mul_ps(xi, _mm512_loadu_ps(this->vdiff + i))); + return _mm512_fmadd_ps( + xi, + _mm512_loadu_ps(this->vdiff + i), + _mm512_loadu_ps(this->vmin + i)); } }; @@ -193,7 +197,8 @@ struct QuantizerFP16_avx512<16> : public QuantizerFP16_avx<8> { QuantizerFP16_avx512(size_t d, const std::vector& trained) : QuantizerFP16_avx<8>(d, trained) {} - __m512 reconstruct_16_components(const uint8_t* code, int i) const { + FAISS_ALWAYS_INLINE __m512 + reconstruct_16_components(const uint8_t* code, int i) const { __m256i codei = _mm256_loadu_si256((const __m256i*)(code + 2 * i)); return _mm512_cvtph_ps(codei); } @@ -223,7 +228,8 @@ struct Quantizer8bitDirect_avx512<16> : public Quantizer8bitDirect_avx<8> { Quantizer8bitDirect_avx512(size_t d, const std::vector& trained) : Quantizer8bitDirect_avx<8>(d, trained) {} - __m512 reconstruct_16_components(const uint8_t* code, int i) const { + FAISS_ALWAYS_INLINE __m512 + reconstruct_16_components(const uint8_t* code, int i) const { __m256i x16 = _mm256_loadu_si256((__m256i*)(code + i)); // 16 * int8 __m512i y16 = _mm512_cvtepu8_epi16(x16); // 16 * int32 return _mm512_cvtepi32_ps(y16); // 16 * float32 @@ -237,20 +243,30 @@ SQuantizer* select_quantizer_1_avx512( const std::vector& trained) { switch (qtype) { case QuantizerType::QT_8bit: - return new QuantizerTemplate_avx512( - d, trained); + return new QuantizerTemplate_avx512< + Codec8bit_avx512, + false, + SIMDWIDTH>(d, trained); case QuantizerType::QT_6bit: - return new QuantizerTemplate_avx512( - d, trained); + return new QuantizerTemplate_avx512< + Codec6bit_avx512, + false, + SIMDWIDTH>(d, trained); case QuantizerType::QT_4bit: - return new QuantizerTemplate_avx512( - d, trained); + return new QuantizerTemplate_avx512< + Codec4bit_avx512, + false, + SIMDWIDTH>(d, trained); case QuantizerType::QT_8bit_uniform: - return new QuantizerTemplate_avx512( - d, trained); + return new QuantizerTemplate_avx512< + Codec8bit_avx512, + true, + SIMDWIDTH>(d, trained); case QuantizerType::QT_4bit_uniform: - return new QuantizerTemplate_avx512( - d, trained); + return new QuantizerTemplate_avx512< + Codec4bit_avx512, + true, + SIMDWIDTH>(d, trained); case QuantizerType::QT_fp16: return new QuantizerFP16_avx512(d, trained); case QuantizerType::QT_8bit_direct: @@ -291,33 +307,28 @@ struct SimilarityL2_avx512<16> { const float *y, *yi; - explicit SimilarityL2_avx512(const float* y): y(y) {} + explicit SimilarityL2_avx512(const float* y) : y(y) {} __m512 accu16; - void begin_16() { + FAISS_ALWAYS_INLINE void begin_16() { accu16 = _mm512_setzero_ps(); yi = y; } - void add_16_components(__m512 x) { + FAISS_ALWAYS_INLINE void add_16_components(__m512 x) { __m512 yiv = _mm512_loadu_ps(yi); yi += 16; __m512 tmp = _mm512_sub_ps(yiv, x); - accu16 = _mm512_add_ps(accu16, _mm512_mul_ps(tmp, tmp)); + accu16 = _mm512_fmadd_ps(tmp, tmp, accu16); } - void add_16_components_2(__m512 x, __m512 y) { + FAISS_ALWAYS_INLINE void add_16_components_2(__m512 x, __m512 y) { __m512 tmp = _mm512_sub_ps(y, x); - accu16 = _mm512_add_ps(accu16, _mm512_mul_ps(tmp, tmp)); + accu16 = _mm512_fmadd_ps(tmp, tmp, accu16); } - float result_16() { - __m256 sum0 = _mm512_extractf32x8_ps(accu16, 1) + _mm512_extractf32x8_ps(accu16, 0); - __m256 sum1 = _mm256_hadd_ps(sum0, sum0); - __m256 sum2 = _mm256_hadd_ps(sum1, sum1); - // now add the 0th and 4th component - return _mm_cvtss_f32(_mm256_castps256_ps128(sum2)) + - _mm_cvtss_f32(_mm256_extractf128_ps(sum2, 1)); + FAISS_ALWAYS_INLINE float result_16() { + return _mm512_reduce_add_ps(accu16); } }; @@ -353,28 +364,23 @@ struct SimilarityIP_avx512<16> { __m512 accu16; - void begin_16() { + FAISS_ALWAYS_INLINE void begin_16() { accu16 = _mm512_setzero_ps(); yi = y; } - void add_16_components(__m512 x) { + FAISS_ALWAYS_INLINE void add_16_components(__m512 x) { __m512 yiv = _mm512_loadu_ps(yi); yi += 16; - accu16 = _mm512_add_ps(accu16, _mm512_mul_ps(yiv, x)); + accu16 = _mm512_fmadd_ps(yiv, x, accu16); } - void add_16_components_2(__m512 x1, __m512 x2) { - accu16 = _mm512_add_ps(accu16, _mm512_mul_ps(x1, x2)); + FAISS_ALWAYS_INLINE void add_16_components_2(__m512 x1, __m512 x2) { + accu16 = _mm512_fmadd_ps(x1, x2, accu16); } - float result_16() { - __m256 sum0 = _mm512_extractf32x8_ps(accu16, 1) + _mm512_extractf32x8_ps(accu16, 0); - __m256 sum1 = _mm256_hadd_ps(sum0, sum0); - __m256 sum2 = _mm256_hadd_ps(sum1, sum1); - // now add the 0th and 4th component - return _mm_cvtss_f32(_mm256_castps256_ps128(sum2)) + - _mm_cvtss_f32(_mm256_extractf128_ps(sum2, 1)); + FAISS_ALWAYS_INLINE float result_16() { + return _mm512_reduce_add_ps(accu16); } }; @@ -419,7 +425,8 @@ struct DCTemplate_avx512 : SQDistanceComputer { return sim.result_16(); } - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) const { + float compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { Similarity sim(nullptr); sim.begin_16(); for (size_t i = 0; i < quant.d; i += 16) { @@ -430,12 +437,12 @@ struct DCTemplate_avx512 : SQDistanceComputer { return sim.result_16(); } - void set_query(const float *x) final { + void set_query(const float* x) final { q = x; } /// compute distance of vector i to current query - float operator() (idx_t i) final { + float operator()(idx_t i) final { return compute_distance(q, codes + i * code_size); } @@ -444,21 +451,19 @@ struct DCTemplate_avx512 : SQDistanceComputer { codes + i * code_size, codes + j * code_size); } - float query_to_code(const uint8_t * code) const override final { + float query_to_code(const uint8_t* code) const override final { return compute_distance(q, code); } void query_to_codes_batch_4( - const uint8_t* __restrict code_0, - const uint8_t* __restrict code_1, - const uint8_t* __restrict code_2, - const uint8_t* __restrict code_3, - float& dis0, - float& dis1, - float& dis2, - float& dis3 - ) const override final { - + const uint8_t* __restrict code_0, + const uint8_t* __restrict code_1, + const uint8_t* __restrict code_2, + const uint8_t* __restrict code_3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) const override final { Similarity sim0(q); Similarity sim1(q); Similarity sim2(q); @@ -497,11 +502,14 @@ struct DCTemplate_avx512 : SQDistanceComputer { float& dis2, float& dis3) override { query_to_codes_batch_4( - codes + idx0 * code_size, - codes + idx1 * code_size, - codes + idx2 * code_size, - codes + idx3 * code_size, - dis0, dis1, dis2, dis3); + codes + idx0 * code_size, + codes + idx1 * code_size, + codes + idx2 * code_size, + codes + idx3 * code_size, + dis0, + dis1, + dis2, + dis3); } }; @@ -536,15 +544,16 @@ struct DistanceComputerByte_avx512 : SQDistanceComputer { DistanceComputerByte_avx512(int d, const std::vector&) : d(d), tmp(d) {} - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) const { + int compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { // __m256i accu = _mm256_setzero_ps (); __m512i accu = _mm512_setzero_si512(); for (int i = 0; i < d; i += 32) { // load 32 bytes, convert to 16 uint16_t - __m512i c1 = _mm512_cvtepu8_epi16 - (_mm256_loadu_si256((__m256i*)(code1 + i))); - __m512i c2 = _mm512_cvtepu8_epi16 - (_mm256_loadu_si256((__m256i*)(code2 + i))); + __m512i c1 = _mm512_cvtepu8_epi16( + _mm256_loadu_si256((__m256i*)(code1 + i))); + __m512i c2 = _mm512_cvtepu8_epi16( + _mm256_loadu_si256((__m256i*)(code2 + i))); __m512i prod32; if (Sim::metric_type == METRIC_INNER_PRODUCT) { prod32 = _mm512_madd_epi16(c1, c2); @@ -554,13 +563,7 @@ struct DistanceComputerByte_avx512 : SQDistanceComputer { } accu = _mm512_add_epi32(accu, prod32); } - __m128i sum = _mm512_extracti32x4_epi32(accu, 0); - sum = _mm_add_epi32(sum, _mm512_extracti32x4_epi32(accu, 1)); - sum = _mm_add_epi32(sum, _mm512_extracti32x4_epi32(accu, 2)); - sum = _mm_add_epi32(sum, _mm512_extracti32x4_epi32(accu, 3)); - sum = _mm_hadd_epi32(sum, sum); - sum = _mm_hadd_epi32(sum, sum); - return _mm_cvtsi128_si32(sum); + return _mm512_reduce_add_epi32(accu); } void set_query(const float* x) final { @@ -580,7 +583,7 @@ struct DistanceComputerByte_avx512 : SQDistanceComputer { } /// compute distance of vector i to current query - float operator() (idx_t i) final { + float operator()(idx_t i) final { return compute_distance(q, codes + i * code_size); } @@ -620,19 +623,28 @@ SQDistanceComputer* select_distance_computer_avx512( case QuantizerType::QT_8bit: return new DCTemplate_avx512< - QuantizerTemplate_avx512, + QuantizerTemplate_avx512< + Codec8bit_avx512, + false, + SIMDWIDTH>, Sim, SIMDWIDTH>(d, trained); case QuantizerType::QT_6bit: return new DCTemplate_avx512< - QuantizerTemplate_avx512, + QuantizerTemplate_avx512< + Codec6bit_avx512, + false, + SIMDWIDTH>, Sim, SIMDWIDTH>(d, trained); case QuantizerType::QT_4bit: return new DCTemplate_avx512< - QuantizerTemplate_avx512, + QuantizerTemplate_avx512< + Codec4bit_avx512, + false, + SIMDWIDTH>, Sim, SIMDWIDTH>(d, trained); @@ -644,7 +656,8 @@ SQDistanceComputer* select_distance_computer_avx512( case QuantizerType::QT_8bit_direct: if (d % 16 == 0) { - return new DistanceComputerByte_avx512(d, trained); + return new DistanceComputerByte_avx512( + d, trained); } else { return new DCTemplate_avx512< Quantizer8bitDirect_avx512, @@ -663,7 +676,8 @@ InvertedListScanner* sel2_InvertedListScanner_avx512( bool store_pairs, const IDSelector* sel, bool r) { - return sel2_InvertedListScanner(sq, quantizer, store_pairs, sel, r); + return sel2_InvertedListScanner( + sq, quantizer, store_pairs, sel, r); } template @@ -676,7 +690,8 @@ InvertedListScanner* sel12_InvertedListScanner_avx512( constexpr int SIMDWIDTH = Similarity::simdwidth; using QuantizerClass = QuantizerTemplate_avx512; using DCClass = DCTemplate_avx512; - return sel2_InvertedListScanner_avx512(sq, quantizer, store_pairs, sel, r); + return sel2_InvertedListScanner_avx512( + sq, quantizer, store_pairs, sel, r); } template @@ -689,20 +704,30 @@ InvertedListScanner* sel1_InvertedListScanner_avx512( constexpr int SIMDWIDTH = Similarity::simdwidth; switch (sq->qtype) { case QuantizerType::QT_8bit_uniform: - return sel12_InvertedListScanner_avx512( - sq, quantizer, store_pairs, sel, r); + return sel12_InvertedListScanner_avx512< + Similarity, + Codec8bit_avx512, + true>(sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_4bit_uniform: - return sel12_InvertedListScanner_avx512( - sq, quantizer, store_pairs, sel, r); + return sel12_InvertedListScanner_avx512< + Similarity, + Codec4bit_avx512, + true>(sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_8bit: - return sel12_InvertedListScanner_avx512( - sq, quantizer, store_pairs, sel, r); + return sel12_InvertedListScanner_avx512< + Similarity, + Codec8bit_avx512, + false>(sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_4bit: - return sel12_InvertedListScanner_avx512( - sq, quantizer, store_pairs, sel, r); + return sel12_InvertedListScanner_avx512< + Similarity, + Codec4bit_avx512, + false>(sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_6bit: - return sel12_InvertedListScanner_avx512( - sq, quantizer, store_pairs, sel, r); + return sel12_InvertedListScanner_avx512< + Similarity, + Codec6bit_avx512, + false>(sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_fp16: return sel2_InvertedListScanner_avx512, diff --git a/thirdparty/faiss/faiss/impl/platform_macros.h b/thirdparty/faiss/faiss/impl/platform_macros.h index 3315d0405..aeafb9531 100644 --- a/thirdparty/faiss/faiss/impl/platform_macros.h +++ b/thirdparty/faiss/faiss/impl/platform_macros.h @@ -82,6 +82,8 @@ inline int __builtin_clzll(uint64_t x) { #define __F16C__ 1 #endif +#define FAISS_ALWAYS_INLINE __forceinline + #else /******************************************************* * Linux and OSX @@ -98,6 +100,8 @@ inline int __builtin_clzll(uint64_t x) { #define ALIGNED(x) __attribute__((aligned(x))) #endif +#define FAISS_ALWAYS_INLINE __attribute__((always_inline)) inline + #endif #if defined(__GNUC__) || defined(__clang__)