diff --git a/src/simd/simd_f32.rs b/src/simd/simd_f32.rs
index 585cfa6..6e34902 100644
--- a/src/simd/simd_f32.rs
+++ b/src/simd/simd_f32.rs
@@ -12,10 +12,12 @@ use std::arch::x86_64::*;
 use super::task::{max_index_value, min_index_value};
 
 const XOR_VALUE: i32 = 0x7FFFFFFF;
+const BIT_SHIFT: i32 = 31;
 
+#[inline(always)]
 fn _ord_i32_to_f32(ord_i32: i32) -> f32 {
     // TODO: more efficient transformation -> can be decreasing order as well
-    let v = ((ord_i32 >> 31) & XOR_VALUE) ^ ord_i32;
+    let v = ((ord_i32 >> BIT_SHIFT) & XOR_VALUE) ^ ord_i32;
     unsafe { std::mem::transmute::<i32, f32>(v) }
 }
 
@@ -28,14 +30,13 @@ mod avx2 {
 
     const LANE_SIZE: usize = AVX2::LANE_SIZE_32;
     const XOR_MASK: __m256i = unsafe { std::mem::transmute([XOR_VALUE; LANE_SIZE]) };
-    const BIT_SHIFT: i32 = 31;
 
     #[inline(always)]
     unsafe fn _f32_as_m256i_to_i32ord(f32_as_m256i: __m256i) -> __m256i {
         // on a scalar: ((v >> 31) & 0x7FFFFFFF) ^ v
         let sign_bit_shifted = _mm256_srai_epi32(f32_as_m256i, BIT_SHIFT);
         let sign_bit_masked = _mm256_and_si256(sign_bit_shifted, XOR_MASK);
-        _mm256_xor_epi32(sign_bit_masked, f32_as_m256i)
+        _mm256_xor_si256(sign_bit_masked, f32_as_m256i)
     }
 
     #[inline(always)]
@@ -55,7 +56,7 @@
 
     #[inline(always)]
     unsafe fn _mm_loadu(data: *const f32) -> __m256i {
-        _f32_as_m256i_to_i32ord(_mm256_loadu_epi32(data as *const i32))
+        _f32_as_m256i_to_i32ord(_mm256_loadu_si256(data as *const __m256i))
     }
 
     #[inline(always)]
@@ -259,7 +260,6 @@ mod sse {
 
     const LANE_SIZE: usize = SSE::LANE_SIZE_32;
     const XOR_MASK: __m128i = unsafe { std::mem::transmute([XOR_VALUE; LANE_SIZE]) };
-    const BIT_SHIFT: i32 = 31;
 
     #[inline(always)]
     unsafe fn _f32_as_m128i_to_i32ord(f32_as_m128i: __m128i) -> __m128i {
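
Review note (not part of the diff): besides hoisting `BIT_SHIFT` out of the `avx2` and `sse` modules, the intrinsic swaps replace `_mm256_xor_epi32` and `_mm256_loadu_epi32`, which are AVX-512F/AVX-512VL intrinsics, with their plain-AVX2 counterparts `_mm256_xor_si256` and `_mm256_loadu_si256`, so the `avx2` module no longer silently requires AVX-512 support. The scalar sketch below illustrates why the `((v >> 31) & 0x7FFFFFFF) ^ v` trick from the comment yields an integer ordering that matches the float ordering and is its own inverse; the helper names `f32_to_ord_i32` / `ord_i32_to_f32` are illustrative, not the crate's API, and `from_bits`/`to_bits` stand in for the crate's transmutes.

```rust
const XOR_VALUE: i32 = 0x7FFF_FFFF;
const BIT_SHIFT: i32 = 31;

// Map an f32 bit pattern to an i32 whose natural ordering matches the
// float ordering: for negative floats the arithmetic shift yields all
// ones, so the XOR flips the 31 magnitude bits (sign bit untouched),
// making "more negative" compare as "smaller".
fn f32_to_ord_i32(v: f32) -> i32 {
    let bits = v.to_bits() as i32;
    ((bits >> BIT_SHIFT) & XOR_VALUE) ^ bits
}

// The transform is an involution: applying it again restores the bits
// (this is the safe-Rust analogue of the diff's `_ord_i32_to_f32`).
fn ord_i32_to_f32(ord_i32: i32) -> f32 {
    let v = ((ord_i32 >> BIT_SHIFT) & XOR_VALUE) ^ ord_i32;
    f32::from_bits(v as u32)
}

fn main() {
    let xs = [f32::MIN, -3.5, -0.0, 0.0, 1.25, f32::MAX];
    let ords: Vec<i32> = xs.iter().map(|&x| f32_to_ord_i32(x)).collect();
    // Integer order agrees with IEEE-754 float order.
    assert!(ords.windows(2).all(|w| w[0] <= w[1]));
    // The round trip restores the exact original bit patterns.
    for (&x, &o) in xs.iter().zip(&ords) {
        assert_eq!(x.to_bits(), ord_i32_to_f32(o).to_bits());
    }
    println!("ordinal transform preserves order and round-trips");
}
```

This is why the vectorized `_f32_as_m256i_to_i32ord` above can feed plain signed-integer min/max comparisons: once the floats are in this ordinal form, lane-wise `i32` comparisons rank them correctly, including negative values.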