From 86410d5762e5188451f8accb9a53fc96171aadcf Mon Sep 17 00:00:00 2001
From: usamoi
Date: Thu, 29 Aug 2024 23:49:47 +0800
Subject: [PATCH] test: strengthen AVX2/AVX512 code tests and remove flaky
 bvector test

Signed-off-by: usamoi
---
 Cargo.lock                           |   1 +
 crates/base/src/lib.rs               |   1 +
 crates/{common => base}/src/rand.rs  |   9 ++
 crates/base/src/scalar/f16.rs        | 166 +++++++++++++-------
 crates/base/src/scalar/f32.rs        | 226 +++++++++++++++------------
 crates/base/src/vector/svect.rs      |   2 +-
 crates/common/src/lib.rs             |   1 -
 crates/common/src/sample.rs          |   2 +-
 crates/detect_macros/src/lib.rs      |   5 +
 crates/k_means/Cargo.toml            |   1 +
 crates/k_means/src/elkan.rs          |   3 +-
 crates/k_means/src/lloyd.rs          |   3 +-
 crates/quantization/src/quantize.rs  |   2 +-
 crates/rabitq/src/quant/quantizer.rs |   1 +
 tests/sqllogictest/bvector.slt       |   5 -
 15 files changed, 264 insertions(+), 164 deletions(-)
 rename crates/{common => base}/src/rand.rs (60%)

diff --git a/Cargo.lock b/Cargo.lock
index aa0a182c3..a495bc2c8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1660,6 +1660,7 @@ version = "0.0.0"
 dependencies = [
  "base",
  "common",
+ "half 2.4.1",
  "rand",
  "smawk",
  "stoppable_rayon",
diff --git a/crates/base/src/lib.rs b/crates/base/src/lib.rs
index d78ab5911..d776e8b73 100644
--- a/crates/base/src/lib.rs
+++ b/crates/base/src/lib.rs
@@ -14,6 +14,7 @@ pub mod distance;
 pub mod index;
 pub mod operator;
 pub mod pod;
+pub mod rand;
 pub mod scalar;
 pub mod search;
 pub mod vector;
diff --git a/crates/common/src/rand.rs b/crates/base/src/rand.rs
similarity index 60%
rename from crates/common/src/rand.rs
rename to crates/base/src/rand.rs
index 75b0bd044..2634fcaa1 100644
--- a/crates/common/src/rand.rs
+++ b/crates/base/src/rand.rs
@@ -9,3 +9,12 @@ where
         _ => unreachable!(),
     }
 }
+
+pub fn sample_u32_sorted<R>(rng: &mut R, length: u32, amount: u32) -> Vec<u32>
+where
+    R: Rng + ?Sized,
+{
+    let mut x = sample_u32(rng, length, amount);
+    x.sort();
+    x
+}
diff --git a/crates/base/src/scalar/f16.rs b/crates/base/src/scalar/f16.rs
index 46838b9f3..2e4e062ea 100644
--- a/crates/base/src/scalar/f16.rs
+++ b/crates/base/src/scalar/f16.rs
@@ -263,22 +263,32 @@ mod reduce_sum_of_xy {
     #[cfg(all(target_arch = "x86_64", test))]
     #[test]
     fn reduce_sum_of_xy_v4_avx512fp16_test() {
+        use rand::Rng;
         const EPSILON: f32 = 2.0;
         detect::init();
         if !detect::v4_avx512fp16::detect() {
             println!("test {} ... skipped (v4_avx512fp16)", module_path!());
             return;
         }
-        for _ in 0..300 {
-            let n = 4000;
-            let lhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            let rhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            let specialized = unsafe { reduce_sum_of_xy_v4_avx512fp16(&lhs, &rhs) };
-            let fallback = unsafe { reduce_sum_of_xy_fallback(&lhs, &rhs) };
-            assert!(
-                (specialized - fallback).abs() < EPSILON,
-                "specialized = {specialized}, fallback = {fallback}."
-            );
+        let mut rng = rand::thread_rng();
+        for _ in 0..256 {
+            let n = 4016;
+            let lhs = (0..n)
+                .map(|_| f16::from_f32(rng.gen_range(-1.0..=1.0)))
+                .collect::<Vec<_>>();
+            let rhs = (0..n)
+                .map(|_| f16::from_f32(rng.gen_range(-1.0..=1.0)))
+                .collect::<Vec<_>>();
+            for z in 3984..4016 {
+                let lhs = &lhs[..z];
+                let rhs = &rhs[..z];
+                let specialized = unsafe { reduce_sum_of_xy_v4_avx512fp16(lhs, rhs) };
+                let fallback = unsafe { reduce_sum_of_xy_fallback(lhs, rhs) };
+                assert!(
+                    (specialized - fallback).abs() < EPSILON,
+                    "specialized = {specialized}, fallback = {fallback}."
+                );
+            }
         }
     }
 
@@ -313,16 +323,22 @@ mod reduce_sum_of_xy {
     #[cfg(all(target_arch = "x86_64", test))]
     #[test]
     fn reduce_sum_of_xy_v4_test() {
+        use rand::Rng;
         const EPSILON: f32 = 2.0;
         detect::init();
         if !detect::v4::detect() {
             println!("test {} ... skipped (v4)", module_path!());
             return;
         }
-        for _ in 0..300 {
-            let n = 4000;
-            let lhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            let rhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
+        let mut rng = rand::thread_rng();
+        for _ in 0..256 {
+            let n = 4016;
+            let lhs = (0..n)
+                .map(|_| f16::from_f32(rng.gen_range(-1.0..=1.0)))
+                .collect::<Vec<_>>();
+            let rhs = (0..n)
+                .map(|_| f16::from_f32(rng.gen_range(-1.0..=1.0)))
+                .collect::<Vec<_>>();
             let specialized = unsafe { reduce_sum_of_xy_v4(&lhs, &rhs) };
             let fallback = unsafe { reduce_sum_of_xy_fallback(&lhs, &rhs) };
             assert!(
@@ -367,22 +383,32 @@ mod reduce_sum_of_xy {
     #[cfg(all(target_arch = "x86_64", test))]
     #[test]
     fn reduce_sum_of_xy_v3_test() {
+        use rand::Rng;
         const EPSILON: f32 = 2.0;
         detect::init();
         if !detect::v3::detect() {
             println!("test {} ... skipped (v3)", module_path!());
             return;
         }
-        for _ in 0..300 {
-            let n = 4000;
-            let lhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            let rhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            let specialized = unsafe { reduce_sum_of_xy_v3(&lhs, &rhs) };
-            let fallback = unsafe { reduce_sum_of_xy_fallback(&lhs, &rhs) };
-            assert!(
-                (specialized - fallback).abs() < EPSILON,
-                "specialized = {specialized}, fallback = {fallback}."
-            );
+        let mut rng = rand::thread_rng();
+        for _ in 0..256 {
+            let n = 4016;
+            let lhs = (0..n)
+                .map(|_| f16::from_f32(rng.gen_range(-1.0..=1.0)))
+                .collect::<Vec<_>>();
+            let rhs = (0..n)
+                .map(|_| f16::from_f32(rng.gen_range(-1.0..=1.0)))
+                .collect::<Vec<_>>();
+            for z in 3984..4016 {
+                let lhs = &lhs[..z];
+                let rhs = &rhs[..z];
+                let specialized = unsafe { reduce_sum_of_xy_v3(lhs, rhs) };
+                let fallback = unsafe { reduce_sum_of_xy_fallback(lhs, rhs) };
+                assert!(
+                    (specialized - fallback).abs() < EPSILON,
+                    "specialized = {specialized}, fallback = {fallback}."
+                );
+            }
         }
     }
 
@@ -434,22 +460,32 @@ mod reduce_sum_of_d2 {
     #[cfg(all(target_arch = "x86_64", test))]
     #[test]
     fn reduce_sum_of_d2_v4_avx512fp16_test() {
-        const EPSILON: f32 = 2.0;
+        use rand::Rng;
+        const EPSILON: f32 = 6.0;
         detect::init();
         if !detect::v4_avx512fp16::detect() {
             println!("test {} ... skipped (v4_avx512fp16)", module_path!());
             return;
         }
-        for _ in 0..300 {
-            let n = 4000;
-            let lhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            let rhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            let specialized = unsafe { reduce_sum_of_d2_v4_avx512fp16(&lhs, &rhs) };
-            let fallback = unsafe { reduce_sum_of_d2_fallback(&lhs, &rhs) };
-            assert!(
-                (specialized - fallback).abs() < EPSILON,
-                "specialized = {specialized}, fallback = {fallback}."
-            );
+        let mut rng = rand::thread_rng();
+        for _ in 0..256 {
+            let n = 4016;
+            let lhs = (0..n)
+                .map(|_| f16::from_f32(rng.gen_range(-1.0..=1.0)))
+                .collect::<Vec<_>>();
+            let rhs = (0..n)
+                .map(|_| f16::from_f32(rng.gen_range(-1.0..=1.0)))
+                .collect::<Vec<_>>();
+            for z in 3984..4016 {
+                let lhs = &lhs[..z];
+                let rhs = &rhs[..z];
+                let specialized = unsafe { reduce_sum_of_d2_v4_avx512fp16(lhs, rhs) };
+                let fallback = unsafe { reduce_sum_of_d2_fallback(lhs, rhs) };
+                assert!(
+                    (specialized - fallback).abs() < EPSILON,
+                    "specialized = {specialized}, fallback = {fallback}."
+                );
+            }
         }
     }
 
@@ -486,22 +522,32 @@ mod reduce_sum_of_d2 {
     #[cfg(all(target_arch = "x86_64", test))]
     #[test]
     fn reduce_sum_of_d2_v4_test() {
+        use rand::Rng;
         const EPSILON: f32 = 2.0;
         detect::init();
         if !detect::v4::detect() {
             println!("test {} ... skipped (v4)", module_path!());
             return;
         }
-        for _ in 0..300 {
-            let n = 4000;
-            let lhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            let rhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            let specialized = unsafe { reduce_sum_of_d2_v4(&lhs, &rhs) };
-            let fallback = unsafe { reduce_sum_of_d2_fallback(&lhs, &rhs) };
-            assert!(
-                (specialized - fallback).abs() < EPSILON,
-                "specialized = {specialized}, fallback = {fallback}."
-            );
+        let mut rng = rand::thread_rng();
+        for _ in 0..256 {
+            let n = 4016;
+            let lhs = (0..n)
+                .map(|_| f16::from_f32(rng.gen_range(-1.0..=1.0)))
+                .collect::<Vec<_>>();
+            let rhs = (0..n)
+                .map(|_| f16::from_f32(rng.gen_range(-1.0..=1.0)))
+                .collect::<Vec<_>>();
+            for z in 3984..4016 {
+                let lhs = &lhs[..z];
+                let rhs = &rhs[..z];
+                let specialized = unsafe { reduce_sum_of_d2_v4(lhs, rhs) };
+                let fallback = unsafe { reduce_sum_of_d2_fallback(lhs, rhs) };
+                assert!(
+                    (specialized - fallback).abs() < EPSILON,
+                    "specialized = {specialized}, fallback = {fallback}."
+                );
+            }
         }
     }
 
@@ -542,22 +588,32 @@ mod reduce_sum_of_d2 {
     #[cfg(all(target_arch = "x86_64", test))]
     #[test]
     fn reduce_sum_of_d2_v3_test() {
+        use rand::Rng;
         const EPSILON: f32 = 2.0;
         detect::init();
         if !detect::v3::detect() {
             println!("test {} ... skipped (v3)", module_path!());
             return;
         }
-        for _ in 0..300 {
-            let n = 4000;
-            let lhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            let rhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            let specialized = unsafe { reduce_sum_of_d2_v3(&lhs, &rhs) };
-            let fallback = unsafe { reduce_sum_of_d2_fallback(&lhs, &rhs) };
-            assert!(
-                (specialized - fallback).abs() < EPSILON,
-                "specialized = {specialized}, fallback = {fallback}."
-            );
+        let mut rng = rand::thread_rng();
+        for _ in 0..256 {
+            let n = 4016;
+            let lhs = (0..n)
+                .map(|_| f16::from_f32(rng.gen_range(-1.0..=1.0)))
+                .collect::<Vec<_>>();
+            let rhs = (0..n)
+                .map(|_| f16::from_f32(rng.gen_range(-1.0..=1.0)))
+                .collect::<Vec<_>>();
+            for z in 3984..4016 {
+                let lhs = &lhs[..z];
+                let rhs = &rhs[..z];
+                let specialized = unsafe { reduce_sum_of_d2_v3(lhs, rhs) };
+                let fallback = unsafe { reduce_sum_of_d2_fallback(lhs, rhs) };
+                assert!(
+                    (specialized - fallback).abs() < EPSILON,
+                    "specialized = {specialized}, fallback = {fallback}."
+                );
+            }
         }
     }
 
diff --git a/crates/base/src/scalar/f32.rs b/crates/base/src/scalar/f32.rs
index a586c2e5a..5b5ac4d98 100644
--- a/crates/base/src/scalar/f32.rs
+++ b/crates/base/src/scalar/f32.rs
@@ -211,16 +211,20 @@ mod reduce_sum_of_x {
     #[cfg(all(target_arch = "x86_64", test))]
     #[test]
     fn reduce_sum_of_x_v4_test() {
-        const EPSILON: f32 = 0.01;
+        use rand::Rng;
+        const EPSILON: f32 = 0.008;
         detect::init();
         if !detect::v4::detect() {
             println!("test {} ... skipped (v4)", module_path!());
             return;
         }
-        for _ in 0..300 {
-            let n = 4010;
-            let this = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            for z in 3990..4010 {
+        let mut rng = rand::thread_rng();
+        for _ in 0..256 {
+            let n = 4016;
+            let this = (0..n)
+                .map(|_| rng.gen_range(-1.0..=1.0))
+                .collect::<Vec<_>>();
+            for z in 3984..4016 {
                 let this = &this[..z];
                 let specialized = unsafe { reduce_sum_of_x_v4(&this) };
                 let fallback = unsafe { reduce_sum_of_x_fallback(&this) };
@@ -268,16 +272,20 @@ mod reduce_sum_of_x {
     #[cfg(all(target_arch = "x86_64", test))]
     #[test]
     fn reduce_sum_of_x_v3_test() {
-        const EPSILON: f32 = 0.01;
+        use rand::Rng;
+        const EPSILON: f32 = 0.008;
         detect::init();
         if !detect::v3::detect() {
             println!("test {} ... skipped (v3)", module_path!());
             return;
         }
-        for _ in 0..300 {
-            let n = 4010;
-            let this = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            for z in 3990..4010 {
+        let mut rng = rand::thread_rng();
+        for _ in 0..256 {
+            let n = 4016;
+            let this = (0..n)
+                .map(|_| rng.gen_range(-1.0..=1.0))
+                .collect::<Vec<_>>();
+            for z in 3984..4016 {
                 let this = &this[..z];
                 let specialized = unsafe { reduce_sum_of_x_v3(this) };
                 let fallback = unsafe { reduce_sum_of_x_fallback(this) };
@@ -327,16 +335,20 @@ mod reduce_sum_of_x2 {
     #[cfg(all(target_arch = "x86_64", test))]
     #[test]
     fn reduce_sum_of_x2_v4_test() {
-        const EPSILON: f32 = 0.01;
+        use rand::Rng;
+        const EPSILON: f32 = 0.006;
         detect::init();
         if !detect::v4::detect() {
             println!("test {} ... skipped (v4)", module_path!());
             return;
         }
-        for _ in 0..300 {
-            let n = 4010;
-            let this = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            for z in 3990..4010 {
+        let mut rng = rand::thread_rng();
+        for _ in 0..256 {
+            let n = 4016;
+            let this = (0..n)
+                .map(|_| rng.gen_range(-1.0..=1.0))
+                .collect::<Vec<_>>();
+            for z in 3984..4016 {
                 let this = &this[..z];
                 let specialized = unsafe { reduce_sum_of_x2_v4(&this) };
                 let fallback = unsafe { reduce_sum_of_x2_fallback(&this) };
@@ -384,16 +396,20 @@ mod reduce_sum_of_x2 {
     #[cfg(all(target_arch = "x86_64", test))]
     #[test]
     fn reduce_sum_of_x2_v3_test() {
-        const EPSILON: f32 = 0.01;
+        use rand::Rng;
+        const EPSILON: f32 = 0.006;
         detect::init();
         if !detect::v3::detect() {
             println!("test {} ... skipped (v3)", module_path!());
             return;
         }
-        for _ in 0..300 {
-            let n = 4010;
-            let this = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            for z in 3990..4010 {
+        let mut rng = rand::thread_rng();
+        for _ in 0..256 {
+            let n = 4016;
+            let this = (0..n)
+                .map(|_| rng.gen_range(-1.0..=1.0))
+                .collect::<Vec<_>>();
+            for z in 3984..4016 {
                 let this = &this[..z];
                 let specialized = unsafe { reduce_sum_of_x2_v3(this) };
                 let fallback = unsafe { reduce_sum_of_x2_fallback(this) };
@@ -451,31 +467,24 @@ mod reduce_min_max_of_x {
     #[cfg(all(target_arch = "x86_64", test))]
     #[test]
     fn reduce_min_max_of_x_v4_test() {
-        const EPSILON: f32 = 0.0001;
+        use rand::Rng;
         detect::init();
         if !detect::v4::detect() {
             println!("test {} ... skipped (v4)", module_path!());
             return;
         }
-        for _ in 0..300 {
+        let mut rng = rand::thread_rng();
+        for _ in 0..256 {
             let n = 200;
-            let x = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
+            let x = (0..n)
+                .map(|_| rng.gen_range(-1.0..=1.0))
+                .collect::<Vec<_>>();
             for z in 50..200 {
                 let x = &x[..z];
                 let specialized = unsafe { reduce_min_max_of_x_v4(x) };
                 let fallback = unsafe { reduce_min_max_of_x_fallback(x) };
-                assert!(
-                    (specialized.0 - fallback.0).abs() < EPSILON,
-                    "min: specialized = {}, fallback = {}.",
-                    specialized.0,
-                    fallback.0,
-                );
-                assert!(
-                    (specialized.1 - fallback.1).abs() < EPSILON,
-                    "max: specialized = {}, fallback = {}.",
-                    specialized.1,
-                    fallback.1,
-                );
+                assert_eq!(specialized.0, fallback.0);
+                assert_eq!(specialized.1, fallback.1);
             }
         }
     }
@@ -515,31 +524,24 @@ mod reduce_min_max_of_x {
     #[cfg(all(target_arch = "x86_64", test))]
     #[test]
     fn reduce_min_max_of_x_v3_test() {
-        const EPSILON: f32 = 0.0001;
+        use rand::Rng;
         detect::init();
         if !detect::v3::detect() {
             println!("test {} ... skipped (v3)", module_path!());
             return;
         }
-        for _ in 0..300 {
+        let mut rng = rand::thread_rng();
+        for _ in 0..256 {
             let n = 200;
-            let x = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
+            let x = (0..n)
+                .map(|_| rng.gen_range(-1.0..=1.0))
+                .collect::<Vec<_>>();
             for z in 50..200 {
                 let x = &x[..z];
                 let specialized = unsafe { reduce_min_max_of_x_v3(x) };
                 let fallback = unsafe { reduce_min_max_of_x_fallback(x) };
-                assert!(
-                    (specialized.0 - fallback.0).abs() < EPSILON,
-                    "specialized = {}, fallback = {}.",
-                    specialized.0,
-                    fallback.0,
-                );
-                assert!(
-                    (specialized.1 - fallback.1).abs() < EPSILON,
-                    "specialized = {}, fallback = {}.",
-                    specialized.1,
-                    fallback.1,
-                );
+                assert_eq!(specialized.0, fallback.0,);
+                assert_eq!(specialized.1, fallback.1,);
             }
         }
     }
@@ -589,17 +591,23 @@ mod reduce_sum_of_xy {
     #[cfg(all(target_arch = "x86_64", test))]
     #[test]
     fn reduce_sum_of_xy_v4_test() {
-        const EPSILON: f32 = 2.0;
+        use rand::Rng;
+        const EPSILON: f32 = 0.004;
         detect::init();
         if !detect::v4::detect() {
             println!("test {} ... skipped (v4)", module_path!());
             return;
         }
-        for _ in 0..300 {
-            let n = 4010;
-            let lhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            let rhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            for z in 3990..4010 {
+        let mut rng = rand::thread_rng();
+        for _ in 0..256 {
+            let n = 4016;
+            let lhs = (0..n)
+                .map(|_| rng.gen_range(-1.0..=1.0))
+                .collect::<Vec<_>>();
+            let rhs = (0..n)
+                .map(|_| rng.gen_range(-1.0..=1.0))
+                .collect::<Vec<_>>();
+            for z in 3984..4016 {
                 let lhs = &lhs[..z];
                 let rhs = &rhs[..z];
                 let specialized = unsafe { reduce_sum_of_xy_v4(lhs, rhs) };
@@ -656,17 +664,23 @@ mod reduce_sum_of_xy {
     #[cfg(all(target_arch = "x86_64", test))]
     #[test]
     fn reduce_sum_of_xy_v3_test() {
-        const EPSILON: f32 = 2.0;
+        use rand::Rng;
+        const EPSILON: f32 = 0.004;
         detect::init();
         if !detect::v3::detect() {
             println!("test {} ... skipped (v3)", module_path!());
             return;
         }
-        for _ in 0..300 {
-            let n = 4010;
-            let lhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            let rhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            for z in 3990..4010 {
+        let mut rng = rand::thread_rng();
+        for _ in 0..256 {
+            let n = 4016;
+            let lhs = (0..n)
+                .map(|_| rng.gen_range(-1.0..=1.0))
+                .collect::<Vec<_>>();
+            let rhs = (0..n)
+                .map(|_| rng.gen_range(-1.0..=1.0))
+                .collect::<Vec<_>>();
+            for z in 3984..4016 {
                 let lhs = &lhs[..z];
                 let rhs = &rhs[..z];
                 let specialized = unsafe { reduce_sum_of_xy_v3(lhs, rhs) };
@@ -725,17 +739,23 @@ mod reduce_sum_of_d2 {
     #[cfg(all(target_arch = "x86_64", test))]
     #[test]
     fn reduce_sum_of_d2_v4_test() {
-        const EPSILON: f32 = 2.0;
+        use rand::Rng;
+        const EPSILON: f32 = 0.02;
         detect::init();
         if !detect::v4::detect() {
             println!("test {} ... skipped (v4)", module_path!());
             return;
         }
-        for _ in 0..300 {
-            let n = 4010;
-            let lhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            let rhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            for z in 3990..4010 {
+        let mut rng = rand::thread_rng();
+        for _ in 0..256 {
+            let n = 4016;
+            let lhs = (0..n)
+                .map(|_| rng.gen_range(-1.0..=1.0))
+                .collect::<Vec<_>>();
+            let rhs = (0..n)
+                .map(|_| rng.gen_range(-1.0..=1.0))
+                .collect::<Vec<_>>();
+            for z in 3984..4016 {
                 let lhs = &lhs[..z];
                 let rhs = &rhs[..z];
                 let specialized = unsafe { reduce_sum_of_d2_v4(lhs, rhs) };
@@ -795,17 +815,23 @@ mod reduce_sum_of_d2 {
     #[cfg(all(target_arch = "x86_64", test))]
     #[test]
     fn reduce_sum_of_d2_v3_test() {
-        const EPSILON: f32 = 2.0;
+        use rand::Rng;
+        const EPSILON: f32 = 0.02;
         detect::init();
         if !detect::v3::detect() {
             println!("test {} ... skipped (v3)", module_path!());
             return;
         }
-        for _ in 0..300 {
-            let n = 4010;
-            let lhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            let rhs = (0..n).map(|_| rand::random::<_>()).collect::<Vec<_>>();
-            for z in 3990..4010 {
+        let mut rng = rand::thread_rng();
+        for _ in 0..256 {
+            let n = 4016;
+            let lhs = (0..n)
+                .map(|_| rng.gen_range(-1.0..=1.0))
+                .collect::<Vec<_>>();
+            let rhs = (0..n)
+                .map(|_| rng.gen_range(-1.0..=1.0))
+                .collect::<Vec<_>>();
+            for z in 3984..4016 {
                 let lhs = &lhs[..z];
                 let rhs = &rhs[..z];
                 let specialized = unsafe { reduce_sum_of_d2_v3(lhs, rhs) };
@@ -881,15 +907,25 @@ mod reduce_sum_of_sparse_xy {
     #[cfg(all(target_arch = "x86_64", test))]
     #[test]
     fn reduce_sum_of_sparse_xy_v4_test() {
-        const EPSILON: f32 = 5e-4;
+        use rand::Rng;
+        const EPSILON: f32 = 0.000001;
         detect::init();
         if !detect::v4::detect() {
             println!("test {} ... skipped (v4)", module_path!());
             return;
         }
-        for _ in 0..300 {
-            let (lidx, lval) = super::random_svector(300);
-            let (ridx, rval) = super::random_svector(350);
+        let mut rng = rand::thread_rng();
+        for _ in 0..256 {
+            let lm = 300;
+            let lidx = crate::rand::sample_u32_sorted(&mut rng, 10000, lm);
+            let lval = (0..lm)
+                .map(|_| rng.gen_range(-1.0..=1.0))
+                .collect::<Vec<_>>();
+            let rm = 350;
+            let ridx = crate::rand::sample_u32_sorted(&mut rng, 10000, rm);
+            let rval = (0..rm)
+                .map(|_| rng.gen_range(-1.0..=1.0))
+                .collect::<Vec<_>>();
             let specialized = unsafe { reduce_sum_of_sparse_xy_v4(&lidx, &lval, &ridx, &rval) };
             let fallback = unsafe { reduce_sum_of_sparse_xy_fallback(&lidx, &lval, &ridx, &rval) };
             assert!(
@@ -1010,15 +1046,25 @@ mod reduce_sum_of_sparse_d2 {
     #[cfg(all(target_arch = "x86_64", test))]
     #[test]
     fn reduce_sum_of_sparse_d2_v4_test() {
-        const EPSILON: f32 = 5e-4;
+        use rand::Rng;
+        const EPSILON: f32 = 0.0004;
         detect::init();
         if !detect::v4::detect() {
             println!("test {} ... skipped (v4)", module_path!());
             return;
         }
-        for _ in 0..30 {
-            let (lidx, lval) = super::random_svector(300);
-            let (ridx, rval) = super::random_svector(350);
+        let mut rng = rand::thread_rng();
+        for _ in 0..256 {
+            let lm = 300;
+            let lidx = crate::rand::sample_u32_sorted(&mut rng, 10000, lm);
+            let lval = (0..lm)
+                .map(|_| rng.gen_range(-1.0..=1.0))
+                .collect::<Vec<_>>();
+            let rm = 350;
+            let ridx = crate::rand::sample_u32_sorted(&mut rng, 10000, rm);
+            let rval = (0..rm)
+                .map(|_| rng.gen_range(-1.0..=1.0))
+                .collect::<Vec<_>>();
             let specialized = unsafe { reduce_sum_of_sparse_d2_v4(&lidx, &lval, &ridx, &rval) };
             let fallback = unsafe { reduce_sum_of_sparse_d2_fallback(&lidx, &lval, &ridx, &rval) };
             assert!(
@@ -1063,19 +1109,3 @@ mod reduce_sum_of_sparse_d2 {
         d2
     }
 }
-
-#[cfg(all(target_arch = "x86_64", test))]
-fn random_svector(len: usize) -> (Vec<u32>, Vec<f32>) {
-    use rand::Rng;
-    let mut rng = rand::thread_rng();
-    let mut indexes = rand::seq::index::sample(&mut rand::thread_rng(), 10000, len)
-        .into_iter()
-        .map(|x| x as _)
-        .collect::<Vec<u32>>();
-    indexes.sort();
-    let values: Vec<f32> = std::iter::from_fn(|| Some(rng.gen_range(-1.0..1.0)))
-        .filter(|&x| x != 0.0)
-        .take(indexes.len())
-        .collect::<Vec<_>>();
-    (indexes, values)
-}
diff --git a/crates/base/src/vector/svect.rs b/crates/base/src/vector/svect.rs
index f7e0a3747..b04f2ab51 100644
--- a/crates/base/src/vector/svect.rs
+++ b/crates/base/src/vector/svect.rs
@@ -34,7 +34,7 @@ impl SVectOwned {
         if len != 0 && !(indexes[len - 1] < dims) {
             return None;
         }
-        // FIXME: SIMD
+        // FIXME: add manually-implemented SIMD version
         for i in 0..len {
             if values[i] == S::zero() {
                 return None;
diff --git a/crates/common/src/lib.rs b/crates/common/src/lib.rs
index c21a90efb..b7ed1419d 100644
--- a/crates/common/src/lib.rs
+++ b/crates/common/src/lib.rs
@@ -3,7 +3,6 @@ pub mod dir_ops;
 pub mod file_atomic;
 pub mod json;
 pub mod mmap_array;
-pub mod rand;
 pub mod remap;
 pub mod sample;
 pub mod variants;
diff --git a/crates/common/src/sample.rs b/crates/common/src/sample.rs
index 7b6ec54e6..06fcd0c32 100644
--- a/crates/common/src/sample.rs
+++ b/crates/common/src/sample.rs
@@ -8,7 +8,7 @@ pub fn sample>(
     g: impl Fn(u32) -> R,
 ) -> Vec2 {
     let m = std::cmp::min(n, m);
-    let f = super::rand::sample_u32(&mut rand::thread_rng(), n, m);
+    let f = base::rand::sample_u32(&mut rand::thread_rng(), n, m);
     let mut samples = Vec2::zeros((m as usize, d as usize));
     for i in 0..m {
         samples[(i as usize,)].copy_from_slice(g(f[i as usize]).as_ref());
diff --git a/crates/detect_macros/src/lib.rs b/crates/detect_macros/src/lib.rs
index 808a5a6e1..ea4ebac9f 100644
--- a/crates/detect_macros/src/lib.rs
+++ b/crates/detect_macros/src/lib.rs
@@ -291,6 +291,11 @@ pub fn main(_: proc_macro::TokenStream) -> proc_macro::TokenStream {
                 true #(&& std::arch::is_aarch64_feature_detected!(#target_features))*
             }
 
+            #[cfg(target_arch = "riscv64")]
+            pub fn test() -> bool {
+                true #(&& std::arch::is_riscv_feature_detected!(#target_features))*
+            }
+
             pub(crate) fn init() {
                 ATOMIC.store(test(), Ordering::Relaxed);
             }
diff --git a/crates/k_means/Cargo.toml b/crates/k_means/Cargo.toml
index 2d639bc46..0da728842 100644
--- a/crates/k_means/Cargo.toml
+++ b/crates/k_means/Cargo.toml
@@ -4,6 +4,7 @@ version.workspace = true
 edition.workspace = true
 
 [dependencies]
+half.workspace = true
 rand.workspace = true
 base = { path = "../base" }
diff --git a/crates/k_means/src/elkan.rs b/crates/k_means/src/elkan.rs
index 51e996d0b..dd9a84726 100644
--- a/crates/k_means/src/elkan.rs
+++ b/crates/k_means/src/elkan.rs
@@ -1,5 +1,6 @@
 use base::scalar::*;
 use common::vec2::Vec2;
+use half::f16;
 use rand::rngs::StdRng;
 use rand::{Rng, SeedableRng};
 use std::ops::{Index, IndexMut};
@@ -17,7 +18,7 @@ pub struct ElkanKMeans {
     first: bool,
 }
 
-const DELTA: f32 = 1.0 / 1024.0;
+const DELTA: f32 = f16::EPSILON.to_f32_const();
 
 impl ElkanKMeans {
     pub fn new(c: usize, samples: Vec2, is_spherical: bool) -> Self {
diff --git a/crates/k_means/src/lloyd.rs b/crates/k_means/src/lloyd.rs
index 5eaa7f544..b32e95c2d 100644
--- a/crates/k_means/src/lloyd.rs
+++ b/crates/k_means/src/lloyd.rs
@@ -1,5 +1,6 @@
 use base::scalar::*;
 use common::vec2::Vec2;
+use half::f16;
 use rand::rngs::StdRng;
 use rand::{Rng, SeedableRng};
 use rayon::iter::IntoParallelRefMutIterator;
@@ -16,7 +17,7 @@ pub struct LloydKMeans {
     samples: Vec2,
 }
 
-const DELTA: f32 = 1.0 / 1024.0;
+const DELTA: f32 = f16::EPSILON.to_f32_const();
 
 impl LloydKMeans {
     pub fn new(c: usize, samples: Vec2, is_spherical: bool) -> Self {
diff --git a/crates/quantization/src/quantize.rs b/crates/quantization/src/quantize.rs
index 0b11d976e..78f1e76fd 100644
--- a/crates/quantization/src/quantize.rs
+++ b/crates/quantization/src/quantize.rs
@@ -163,7 +163,7 @@ pub fn dequantize(sum_1: u32, k: f32, b: f32, sum_x: u16) -> f32 {
 }
 
 // FIXME: the result may not fit in an u16
-// FIXME: generated code for AVX512 is bad, and that for AVX2 is not good, so rewrite it
+// FIXME: add manually-implemented SIMD version
 #[detect::multiversion(v4, v3, v2, neon, fallback)]
 pub fn reduce_sum_of_x(vector: &[u8]) -> u16 {
     let n = vector.len();
diff --git a/crates/rabitq/src/quant/quantizer.rs b/crates/rabitq/src/quant/quantizer.rs
index 5ae5c1aab..9855e7402 100644
--- a/crates/rabitq/src/quant/quantizer.rs
+++ b/crates/rabitq/src/quant/quantizer.rs
@@ -43,6 +43,7 @@ impl RabitqQuantizer {
     }
 
     pub fn encode(&self, vector: &[f32]) -> (f32, f32, f32, f32, Vec<u8>) {
+        // FIXME: add manually-implemented SIMD version
         let sum_of_abs_x = vector.iter().map(|x| x.abs()).sum::<f32>();
         let sum_of_x_2 = f32::reduce_sum_of_x2(vector);
         let dis_u = sum_of_x_2.sqrt();
diff --git a/tests/sqllogictest/bvector.slt b/tests/sqllogictest/bvector.slt
index 2088d3601..a96a76235 100644
--- a/tests/sqllogictest/bvector.slt
+++ b/tests/sqllogictest/bvector.slt
@@ -29,11 +29,6 @@ SELECT COUNT(1) FROM (SELECT 1 FROM t ORDER BY val <#> '[0,1,0,1,0,1,0,1,0,1]'::
 ----
 10
 
-query I
-SELECT COUNT(1) FROM (SELECT 1 FROM t ORDER BY val <~> '[0,1,0,1,0,1,0,1,0,1]'::bvector limit 10) t2;
-----
-10
-
 statement ok
 DROP TABLE t;