compensate for the missing acceleration functions in ARM NEON. #922
Conversation
@Presburger 🔍 Important: PR Classification Needed! For efficient project management and a seamless review process, it's essential to classify your PR correctly. Here's how:
For any PR outside the kind/improvement category, ensure you link to the associated issue using the format: “issue: #”. Thanks for your efforts and contribution to the community!
Force-pushed from 39e6861 to 07d5e5e. Signed-off-by: yusheng.ma <yusheng.ma@zilliz.com>
Codecov Report: All modified and coverable lines are covered by tests ✅

Additional details and impacted files:

@@           Coverage Diff            @@
##           main     #922      +/-  ##
========================================
+ Coverage      0   79.25%   +79.25%
========================================
  Files         0       81       +81
  Lines         0     6350     +6350
========================================
+ Hits          0     5033     +5033
- Misses        0     1317     +1317
    // Convert back to float
    return vreinterpretq_f32_u32(rounded_bits);
}

float
fvec_inner_product_neon(const float* x, const float* y, size_t d) {
there's no need to write manual code for this. Please use the following:
// trust the compiler to unroll this properly
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_inner_product_neon(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
res += x[i] * y[i];
}
return res;
}
FAISS_PRAGMA_IMPRECISE_FUNCTION_END
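For context on what these macros do: in faiss they live in platform_macros.h and expand to compiler-specific pragmas, roughly as sketched below (paraphrased from memory, so treat the exact strings as assumptions rather than verbatim definitions):

// Approximate expansion of the FAISS "imprecise math" macros (sketch).
#if defined(__clang__)
// clang: relax FP precision per-function and hint the loop vectorizer.
#define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN \
    _Pragma("float_control(precise, off, push)")
#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END _Pragma("float_control(pop)")
#define FAISS_PRAGMA_IMPRECISE_LOOP \
    _Pragma("clang loop vectorize(enable) interleave(enable)")
#elif defined(__GNUC__)
// GCC: allow reassociation and unrolling for the enclosed function only,
// without enabling -ffast-math for the whole translation unit.
#define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN \
    _Pragma("GCC push_options") \
    _Pragma("GCC optimize(\"unroll-loops,associative-math,no-signed-zeros\")")
#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END _Pragma("GCC pop_options")
#define FAISS_PRAGMA_IMPRECISE_LOOP
#endif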
"I compared the automatically generated NEON assembly from the compiler in the no-SVE scenario with handwritten NEON intrinsic assembly. Based on my experience, the handwritten version tends to be more efficient. I used ARM64 GCC 13.1 for analysis. Additionally, relying on compiler expansion requires more precise control over compilation commands, as the compiler is likely to generate SVE instructions. In China, many ARM processors currently cannot execute SVE commands."
Compiler-generated assembly (ARM64 GCC 13.1, auto-vectorized from the scalar loop):
fvec_inner_product_neon(float const*, float const*, unsigned long):
cbz x2, .L10
sub x3, x2, #1
cmp x3, 14
bls .L11
lsr x7, x2, 4
movi v2.4s, 0
mov x12, x0
lsl x5, x7, 6
add x8, x0, x7, lsl 6
mov x4, x1
sub x6, x5, #64
lsr x9, x6, 6
add x10, x9, 1
mov v18.16b, v2.16b
mov v3.16b, v2.16b
ands x11, x10, 3
mov v19.16b, v2.16b
beq .L4
cmp x11, 1
beq .L30
cmp x11, 2
beq .L31
ldp q4, q6, [x0, 16]
ldp q0, q16, [x1, 16]
ldr q5, [x0, 48]
ldr q20, [x1, 48]
ldr q7, [x12], 64
ldr q17, [x4], 64
fmla v3.4s, v4.4s, v0.4s
fmla v18.4s, v6.4s, v16.4s
fmla v19.4s, v7.4s, v17.4s
fmla v2.4s, v5.4s, v20.4s
.L31:
ldp q21, q22, [x12, 16]
ldp q24, q25, [x4, 16]
ldr q23, [x12, 48]
ldr q27, [x4, 48]
ldr q26, [x12], 64
ldr q28, [x4], 64
fmla v3.4s, v21.4s, v24.4s
fmla v18.4s, v22.4s, v25.4s
fmla v2.4s, v23.4s, v27.4s
fmla v19.4s, v26.4s, v28.4s
.L30:
ldp q29, q30, [x12, 16]
ldp q1, q7, [x4, 16]
ldr q31, [x12, 48]
ldr q6, [x4, 48]
ldr q5, [x12], 64
ldr q4, [x4], 64
fmla v3.4s, v29.4s, v1.4s
fmla v18.4s, v30.4s, v7.4s
fmla v2.4s, v31.4s, v6.4s
fmla v19.4s, v5.4s, v4.4s
cmp x12, x8
beq .L40
.L4:
mov x14, x4
mov x13, x12
ldp q17, q20, [x12, 16]
add x12, x12, 256
ldr q16, [x14], 64
ldr q0, [x13], 64
ldp q22, q23, [x4, 16]
add x4, x4, 256
ldr q21, [x12, -208]
ldr q24, [x4, -208]
fmla v19.4s, v0.4s, v16.4s
ldp q25, q26, [x14, 16]
ldp q27, q28, [x13, 16]
fmla v3.4s, v17.4s, v22.4s
ldr q30, [x12, -192]
ldr q31, [x4, -192]
fmla v18.4s, v20.4s, v23.4s
ldr q29, [x13, 48]
fmla v2.4s, v21.4s, v24.4s
ldr q1, [x14, 48]
ldp q7, q6, [x13, 80]
fmla v3.4s, v27.4s, v25.4s
ldr q20, [x14, 64]
ldr q25, [x13, 64]
fmla v18.4s, v28.4s, v26.4s
ldr q5, [x14, 80]
fmla v19.4s, v30.4s, v31.4s
ldr q4, [x14, 96]
ldr q0, [x13, 112]
fmla v2.4s, v29.4s, v1.4s
ldr q16, [x14, 112]
fmla v3.4s, v7.4s, v5.4s
ldr q23, [x12, -64]
ldr q28, [x4, -64]
fmla v18.4s, v6.4s, v4.4s
ldr q17, [x12, -48]
fmla v2.4s, v0.4s, v16.4s
ldr q22, [x4, -48]
ldr q21, [x12, -32]
fmla v19.4s, v25.4s, v20.4s
ldr q27, [x4, -32]
ldr q24, [x4, -16]
fmla v3.4s, v17.4s, v22.4s
ldr q26, [x12, -16]
fmla v19.4s, v23.4s, v28.4s
fmla v18.4s, v21.4s, v27.4s
fmla v2.4s, v26.4s, v24.4s
cmp x12, x8
bne .L4
.L40:
and x3, x2, -16
fadd v2.4s, v18.4s, v2.4s
fadd v3.4s, v19.4s, v3.4s
fadd v1.4s, v2.4s, v3.4s
faddp v18.4s, v1.4s, v1.4s
faddp v0.4s, v18.4s, v18.4s
tst x2, 15
beq .L1
.L3:
sub x15, x2, x3
sub x16, x15, #1
cmp x16, 2
bls .L7
lsl x17, x3, 2
mov v30.16b, v1.16b
lsr x18, x15, 2
add x5, x0, x3, lsl 2
ldr q19, [x1, x17]
add x7, x1, x3, lsl 2
ldr q29, [x0, x17]
fmla v30.4s, v19.4s, v29.4s
cmp x18, 1
beq .L8
ldr q31, [x7, 16]
ldr q1, [x5, 16]
fmla v30.4s, v1.4s, v31.4s
cmp x18, 2
beq .L8
ldr q4, [x7, 32]
ldr q5, [x5, 32]
fmla v30.4s, v5.4s, v4.4s
.L8:
and x8, x15, -4
add x3, x3, x8
faddp v7.4s, v30.4s, v30.4s
faddp v0.4s, v7.4s, v7.4s
tst x15, 3
beq .L1
.L7:
lsl x6, x3, 2
add x9, x3, 1
ldr s6, [x0, x6]
ldr s17, [x1, x6]
fmadd s0, s6, s17, s0
cmp x2, x9
bls .L1
add x10, x6, 4
add x11, x3, 2
ldr s20, [x1, x10]
ldr s16, [x0, x10]
fmadd s0, s20, s16, s0
cmp x2, x11
bls .L1
add x2, x6, 8
ldr s21, [x0, x2]
ldr s22, [x1, x2]
fmadd s0, s21, s22, s0
.L1:
ret
.L10:
movi v0.2s, #0
ret
.L11:
movi v1.4s, 0
mov x3, 0
fmov s0, s1
b .L3
Handwritten NEON intrinsics version:
fvec_inner_product_neon(float const*, float const*, unsigned long):
cmp x2, 15
bls .L11
sub x5, x2, #16
movi v0.4s, 0
add x4, x0, 64
lsr x5, x5, 4
mov x3, x0
add x5, x4, x5, lsl 6
mov x4, x1
.L3:
ld1 {v16.4s - v19.4s}, [x3]
add x3, x3, 64
ld1 {v4.4s - v7.4s}, [x4]
add x4, x4, 64
fmul v2.4s, v19.4s, v7.4s
fmul v1.4s, v17.4s, v5.4s
fmla v2.4s, v6.4s, v18.4s
fmla v1.4s, v16.4s, v4.4s
fadd v1.4s, v1.4s, v2.4s
fadd v0.4s, v0.4s, v1.4s
cmp x5, x3
bne .L3
and x3, x2, 15
.L2:
cmp x3, 7
bls .L4
sub x4, x2, x3
sub x3, x3, #8
add x5, x0, x4, lsl 2
add x4, x1, x4, lsl 2
ld1 {v4.4s - v5.4s}, [x5]
ld1 {v2.4s - v3.4s}, [x4]
fmul v1.4s, v5.4s, v3.4s
fmla v1.4s, v4.4s, v2.4s
fadd v0.4s, v0.4s, v1.4s
.L4:
cmp x3, 3
bls .L5
sub x4, x2, x3
sub x3, x3, #4
lsl x4, x4, 2
ldr q2, [x0, x4]
ldr q1, [x1, x4]
fmla v0.4s, v2.4s, v1.4s
.L5:
cmp x3, 3
beq .L16
movi v1.4s, 0
cmp x3, 2
beq .L17
cbnz x3, .L18
fadd v0.4s, v0.4s, v1.4s
faddp v0.4s, v0.4s, v0.4s
faddp v0.4s, v0.4s, v0.4s
ret
.L16:
movi v1.4s, 0
sub x3, x2, #3
add x4, x0, x3, lsl 2
add x3, x1, x3, lsl 2
mov v2.16b, v1.16b
ld1 {v1.s}[2], [x3]
ld1 {v2.s}[2], [x4]
.L7:
sub x3, x2, #2
add x4, x0, x3, lsl 2
add x3, x1, x3, lsl 2
ld1 {v2.s}[1], [x4]
ld1 {v1.s}[1], [x3]
.L9:
sub x2, x2, #1
add x0, x0, x2, lsl 2
add x2, x1, x2, lsl 2
ld1 {v2.s}[0], [x0]
ld1 {v1.s}[0], [x2]
fmul v1.4s, v2.4s, v1.4s
fadd v0.4s, v0.4s, v1.4s
faddp v0.4s, v0.4s, v0.4s
faddp v0.4s, v0.4s, v0.4s
ret
.L18:
mov v2.16b, v1.16b
b .L9
.L11:
movi v0.4s, 0
mov x3, x2
b .L2
.L17:
mov v2.16b, v1.16b
b .L7
@@ -211,12 +316,99 @@ bf16_vec_inner_product_neon(const knowhere::bf16* x, const knowhere::bf16* y, size_t d)

float
fvec_L2sqr_neon(const float* x, const float* y, size_t d) {
    float32x4_t sum_ = {0.0f, 0.0f, 0.0f, 0.0f};
there's no need to write manual code for this. Please use the following:
// trust the compiler to unroll this properly
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_L2sqr_neon(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
const float tmp = x[i] - y[i];
res += tmp * tmp;
}
return res;
}
FAISS_PRAGMA_IMPRECISE_FUNCTION_END
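For comparison with the scalar reference above, a hand-written NEON version of the same kernel could look like this minimal sketch (single accumulator, illustrative name, not the PR's exact code):

#include <arm_neon.h>
#include <cstddef>

// Minimal NEON L2-squared sketch: vectorized main loop plus scalar tail.
float
fvec_L2sqr_neon_sketch(const float* x, const float* y, size_t d) {
    float32x4_t acc = vdupq_n_f32(0.0f);
    size_t i = 0;
    for (; i + 4 <= d; i += 4) {
        const float32x4_t diff = vsubq_f32(vld1q_f32(x + i), vld1q_f32(y + i));
        acc = vfmaq_f32(acc, diff, diff);  // acc += diff * diff
    }
    float res = vaddvq_f32(acc);  // horizontal sum
    for (; i < d; ++i) {
        const float tmp = x[i] - y[i];
        res += tmp * tmp;
    }
    return res;
}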

float
fvec_inner_product_neon_bf16_patch(const float* x, const float* y, size_t d) {
    float32x4_t sum_ = vdupq_n_f32(0.0f);
// trust the compiler to unroll this properly
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_inner_product_neon_bf16_patch(const float* x, const float* y, size_t d) {
size_t i;
float res = 0;
FAISS_PRAGMA_IMPRECISE_LOOP
for (i = 0; i < d; i++) {
res += x[i] * bf16_float(y[i]);
}
return res;
}
FAISS_PRAGMA_IMPRECISE_FUNCTION_END
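The bf16_float helper referenced here converts a float down to bfloat16 precision and widens it back; a plausible stand-in (an assumption for illustration, not the knowhere implementation) is:

#include <cstdint>
#include <cstring>

// Hypothetical bf16_float (not the knowhere source): round a float to
// bfloat16 precision and widen it back to float. NaN handling is
// omitted in this sketch.
inline float
bf16_float(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    bits += 0x8000;        // round half-up on the dropped mantissa bits
    bits &= 0xFFFF0000u;   // keep sign, exponent, and top 7 mantissa bits
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}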

float
fvec_L2sqr_neon_bf16_patch(const float* x, const float* y, size_t d) {
    float32x4_t sum_ = vdupq_n_f32(0.0f);
same here and in the following.
Basically, please refer to distances_avx512.cpp to see which functions are delegated to the compiler to be SIMD-ed properly.
@Presburger what is the compiler version that we're targeting for this diff? Technically, the only possible need for manual code is if we rely on very old C++ compilers.
Basically, we need to use the same code as in the ref version, but in a .cpp file that has NEON optimizations enabled, as sketched below.
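A possible shape for that (a sketch under assumptions: the file name, compile flags, and header path are illustrative, not the project's actual build setup):

// distances_neon.cpp (hypothetical file name): compile this translation
// unit with NEON enabled and SVE pinned off, for example
//   g++ -O3 -march=armv8.2-a+nosve -c distances_neon.cpp
// so the reference-style loop below auto-vectorizes to NEON only,
// addressing the SVE concern raised above.
#include <cstddef>

#include "faiss/impl/platform_macros.h"  // assumed home of FAISS_PRAGMA_*

FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN
float
fvec_inner_product_neon(const float* x, const float* y, size_t d) {
    size_t i;
    float res = 0;
    FAISS_PRAGMA_IMPRECISE_LOOP
    for (i = 0; i < d; i++) {
        res += x[i] * y[i];
    }
    return res;
}
FAISS_PRAGMA_IMPRECISE_FUNCTION_END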
/lgtm
[APPROVALNOTIFIER] This PR is APPROVED

This pull-request has been approved by: alexanderguzhva, Presburger

The full list of commands accepted by this bot can be found here. The pull request process is described here.

Needs approval from an approver in each of these files:

Approvers can indicate their approval by writing /approve in a comment. Approvers can cancel their approval by writing /approve cancel in a comment.
/lgtm
issue: #919