From 77e5059b124def81e097a853688bf7a06df9e2fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Silva?= Date: Sun, 19 May 2024 01:05:12 +0100 Subject: [PATCH 1/4] Add SIMD versions of scrambler and vector multiplication MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: João Silva --- configure.ac | 140 ++++++++++++++++-- makefile.in | 97 ++++++++---- src/random/src/scramble.avx.c | 132 +++++++++++++++++ src/random/src/scramble.avx512f.c | 132 +++++++++++++++++ src/random/src/scramble.c | 4 +- src/random/src/scramble.sse.c | 136 +++++++++++++++++ .../{vectorcf_add.port.c => vectorcf_add.c} | 0 src/vector/src/vectorcf_mul.avx.c | 131 ++++++++++++++++ src/vector/src/vectorcf_mul.avx512f.c | 120 +++++++++++++++ .../{vectorcf_mul.port.c => vectorcf_mul.c} | 0 src/vector/src/vectorcf_mul.sse.c | 131 ++++++++++++++++ .../{vectorcf_norm.port.c => vectorcf_norm.c} | 0 .../{vectorcf_trig.port.c => vectorcf_trig.c} | 0 .../src/{vectorf_add.port.c => vectorf_add.c} | 0 .../src/{vectorf_mul.port.c => vectorf_mul.c} | 0 .../{vectorf_norm.port.c => vectorf_norm.c} | 0 .../{vectorf_trig.port.c => vectorf_trig.c} | 0 17 files changed, 982 insertions(+), 41 deletions(-) create mode 100644 src/random/src/scramble.avx.c create mode 100644 src/random/src/scramble.avx512f.c create mode 100644 src/random/src/scramble.sse.c rename src/vector/src/{vectorcf_add.port.c => vectorcf_add.c} (100%) create mode 100644 src/vector/src/vectorcf_mul.avx.c create mode 100644 src/vector/src/vectorcf_mul.avx512f.c rename src/vector/src/{vectorcf_mul.port.c => vectorcf_mul.c} (100%) create mode 100644 src/vector/src/vectorcf_mul.sse.c rename src/vector/src/{vectorcf_norm.port.c => vectorcf_norm.c} (100%) rename src/vector/src/{vectorcf_trig.port.c => vectorcf_trig.c} (100%) rename src/vector/src/{vectorf_add.port.c => vectorf_add.c} (100%) rename src/vector/src/{vectorf_mul.port.c => vectorf_mul.c} (100%) rename src/vector/src/{vectorf_norm.port.c => vectorf_norm.c} (100%) rename src/vector/src/{vectorf_trig.port.c => vectorf_trig.c} (100%) diff --git a/configure.ac b/configure.ac index 72518d1ce..d9c3e714f 100644 --- a/configure.ac +++ b/configure.ac @@ -157,6 +157,15 @@ if test "${enable_simdoverride+set}" = set; then src/dotprod/src/dotprod_crcf.o \ src/dotprod/src/dotprod_rrrf.o \ src/dotprod/src/sumsq.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.o" ARCH_OPTION="" else # Check canonical system @@ -174,12 +183,37 @@ else # AVX512 : immintrin.h AX_EXT - if [ test "$ax_cv_have_avx512f_ext" = yes ]; then + if [ test "$ax_cv_have_avx512bw_ext" = yes ]; then # AVX512 extensions MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.avx512f.o \ src/dotprod/src/dotprod_crcf.avx512f.o \ src/dotprod/src/dotprod_rrrf.avx512f.o \ src/dotprod/src/sumsq.avx512f.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.avx512f.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.avx512f.o" + ARCH_OPTION='-mavx512bw' + elif [ test "$ax_cv_have_avx512f_ext" = yes ]; then + # AVX512 extensions + MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.avx512f.o \ + src/dotprod/src/dotprod_crcf.avx512f.o \ + src/dotprod/src/dotprod_rrrf.avx512f.o \ + src/dotprod/src/sumsq.avx512f.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.avx512f.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.avx512f.o" ARCH_OPTION='-mavx512f' elif [ test "$ax_cv_have_avx2_ext" = yes ]; then # AVX2 extensions @@ -187,6 +221,15 @@ else src/dotprod/src/dotprod_crcf.avx.o \ src/dotprod/src/dotprod_rrrf.avx.o \ src/dotprod/src/sumsq.avx.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.avx.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.avx.o" ARCH_OPTION='-mavx2' elif [ test "$ax_cv_have_avx_ext" = yes ]; then # AVX extensions @@ -194,6 +237,15 @@ else src/dotprod/src/dotprod_crcf.avx.o \ src/dotprod/src/dotprod_rrrf.avx.o \ src/dotprod/src/sumsq.avx.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.avx.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.sse.o" ARCH_OPTION='-mavx' elif [ test "$ax_cv_have_sse41_ext" = yes ]; then # SSE4.1/2 extensions @@ -201,6 +253,15 @@ else src/dotprod/src/dotprod_crcf.sse.o \ src/dotprod/src/dotprod_rrrf.sse.o \ src/dotprod/src/sumsq.sse.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.sse.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.sse.o" ARCH_OPTION='-msse4.1' elif [ test "$ax_cv_have_sse3_ext" = yes ]; then # SSE3 extensions @@ -208,6 +269,15 @@ else src/dotprod/src/dotprod_crcf.sse.o \ src/dotprod/src/dotprod_rrrf.sse.o \ src/dotprod/src/sumsq.sse.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.sse.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.sse.o" ARCH_OPTION='-msse3' elif [ test "$ax_cv_have_sse2_ext" = yes ]; then # SSE2 extensions @@ -215,6 +285,15 @@ else src/dotprod/src/dotprod_crcf.sse.o \ src/dotprod/src/dotprod_rrrf.sse.o \ src/dotprod/src/sumsq.sse.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.sse.o" ARCH_OPTION='-msse2' else # portable C version @@ -222,12 +301,30 @@ else src/dotprod/src/dotprod_crcf.o \ src/dotprod/src/dotprod_rrrf.o \ src/dotprod/src/sumsq.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.o" fi;; powerpc*) MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.o \ src/dotprod/src/dotprod_rrrf.av.o \ src/dotprod/src/dotprod_crcf.av.o \ src/dotprod/src/sumsq.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.o" ARCH_OPTION="-fno-common -faltivec";; armv1*|armv2*|armv3*|armv4*|armv5*|armv6*) # assume neon instructions are NOT available @@ -235,6 +332,15 @@ else src/dotprod/src/dotprod_crcf.o \ src/dotprod/src/dotprod_rrrf.o \ src/dotprod/src/sumsq.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.o" ARCH_OPTION="-ffast-math";; arm|armv7*|armv8*) # assume neon instructions are available @@ -245,6 +351,15 @@ else src/dotprod/src/dotprod_crcf.neon.o \ src/dotprod/src/dotprod_rrrf.neon.o \ src/dotprod/src/sumsq.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.o" case $target_os in darwin*) # M1 mac, ARM architecture : use neon extensions @@ -260,21 +375,19 @@ else src/dotprod/src/dotprod_crcf.o \ src/dotprod/src/dotprod_rrrf.o \ src/dotprod/src/sumsq.o" + MLIBS_VECTOR="src/vector/src/vectorf_add.o \ + src/vector/src/vectorf_norm.o \ + src/vector/src/vectorf_mul.o \ + src/vector/src/vectorf_trig.o \ + src/vector/src/vectorcf_add.o \ + src/vector/src/vectorcf_norm.o \ + src/vector/src/vectorcf_mul.o \ + src/vector/src/vectorcf_trig.o" + MLIBS_RANDOM="src/random/src/scramble.o" ARCH_OPTION="";; esac fi - -# for now all vector operations are portable C versions -MLIBS_VECTOR="src/vector/src/vectorf_add.port.o \ - src/vector/src/vectorf_norm.port.o \ - src/vector/src/vectorf_mul.port.o \ - src/vector/src/vectorf_trig.port.o \ - src/vector/src/vectorcf_add.port.o \ - src/vector/src/vectorcf_norm.port.o \ - src/vector/src/vectorcf_mul.port.o \ - src/vector/src/vectorcf_trig.port.o" - case $target_os in darwin*) AN_MAKEVAR([LIBTOOL], [AC_PROG_LIBTOOL]) @@ -302,8 +415,9 @@ esac # autoconf variable substitutions # AC_SUBST(LIBS) # shared libraries (-lc, -lm, etc.) -AC_SUBST(MLIBS_DOTPROD) # +AC_SUBST(MLIBS_DOTPROD) # AC_SUBST(MLIBS_VECTOR) # +AC_SUBST(MLIBS_RANDOM) # AC_SUBST(AR_LIB) # archive library AC_SUBST(SH_LIB) # output shared library target diff --git a/makefile.in b/makefile.in index ee3e20c83..82b1a1483 100644 --- a/makefile.in +++ b/makefile.in @@ -208,21 +208,27 @@ src/dotprod/src/sumsq.o : %.o : %.c $(include_headers) # specific machine architectures -# AltiVec -src/dotprod/src/dotprod_rrrf.av.o : %.o : %.c $(include_headers) - -# MMX/SSE2 -src/dotprod/src/dotprod_rrrf.mmx.o : %.o : %.c $(include_headers) -src/dotprod/src/dotprod_crcf.mmx.o : %.o : %.c $(include_headers) -src/dotprod/src/dotprod_cccf.mmx.o : %.o : %.c $(include_headers) - -src/dotprod/src/sumsq.mmx.o : %.o : %.c $(include_headers) +# AVX512F +src/dotprod/src/dotprod_rrrf.avx512f.o : %.o : %.c $(include_headers) +src/dotprod/src/dotprod_crcf.avx512f.o : %.o : %.c $(include_headers) +src/dotprod/src/dotprod_cccf.avx512f.o : %.o : %.c $(include_headers) +src/dotprod/src/sumsq.avx512f.o : %.o : %.c $(include_headers) + +# AVX/AVX2 +src/dotprod/src/dotprod_rrrf.avx.o : %.o : %.c $(include_headers) +src/dotprod/src/dotprod_crcf.avx.o : %.o : %.c $(include_headers) +src/dotprod/src/dotprod_cccf.avx.o : %.o : %.c $(include_headers) +src/dotprod/src/sumsq.avx.o : %.o : %.c $(include_headers) -# SSE4.1/2 -src/dotprod/src/dotprod_rrrf.sse4.o : %.o : %.c $(include_headers) +# SSE2/SSE3/SSE4.1/SSE4.2 +src/dotprod/src/dotprod_rrrf.sse.o : %.o : %.c $(include_headers) +src/dotprod/src/dotprod_crcf.sse.o : %.o : %.c $(include_headers) +src/dotprod/src/dotprod_cccf.sse.o : %.o : %.c $(include_headers) +src/dotprod/src/sumsq.sse.o : %.o : %.c $(include_headers) -# AVX -src/dotprod/src/sumsq.avx.o : %.o : %.c $(include_headers) +# AltiVec +src/dotprod/src/dotprod_rrrf.av.o : %.o : %.c $(include_headers) +src/dotprod/src/dotprod_crcf.av.o : %.o : %.c $(include_headers) # ARM Neon src/dotprod/src/dotprod_rrrf.neon.o : %.o : %.c $(include_headers) @@ -1049,10 +1055,11 @@ quantization_benchmarks := \ src/quantization/bench/quantizer_benchmark.c \ src/quantization/bench/compander_benchmark.c \ -# +# # MODULE : random # +# main objects that only have portable builds random_objects := \ src/random/src/rand.o \ src/random/src/randn.o \ @@ -1061,11 +1068,33 @@ random_objects := \ src/random/src/randgamma.o \ src/random/src/randnakm.o \ src/random/src/randricek.o \ - src/random/src/scramble.o \ - $(random_objects) : %.o : %.c $(include_headers) +# main objects list +random_objects += \ + @MLIBS_RANDOM@ \ + +# portable builds +src/random/src/scramble.o : %.o : %.c $(include_headers) + +# specific machine architectures + +# avx512f +src/random/src/scramble.avx512v.o : %.o : %.c $(include_headers) + +# AVX/AVX2 +src/random/src/scramble.avx.o : %.o : %.c $(include_headers) + +# SSE2/SSE3/SSE4.1/SSE4.2 +src/random/src/scramble.sse.o : %.o : %.c $(include_headers) + +# AltiVec +# TODO... + +# ARM Neon +# TODO... + # autotests random_autotests := \ src/random/tests/scramble_autotest.c \ @@ -1136,17 +1165,31 @@ vector_objects := \ @MLIBS_VECTOR@ \ # portable builds -src/vector/src/vectorf_add.port.o : %.o : %.c $(include_headers) src/vector/src/vector_add.proto.c -src/vector/src/vectorf_norm.port.o : %.o : %.c $(include_headers) src/vector/src/vector_norm.proto.c -src/vector/src/vectorf_mul.port.o : %.o : %.c $(include_headers) src/vector/src/vector_mul.proto.c -src/vector/src/vectorf_trig.port.o : %.o : %.c $(include_headers) src/vector/src/vector_trig.proto.c -src/vector/src/vectorcf_add.port.o : %.o : %.c $(include_headers) src/vector/src/vector_add.proto.c -src/vector/src/vectorcf_norm.port.o : %.o : %.c $(include_headers) src/vector/src/vector_norm.proto.c -src/vector/src/vectorcf_mul.port.o : %.o : %.c $(include_headers) src/vector/src/vector_mul.proto.c -src/vector/src/vectorcf_trig.port.o : %.o : %.c $(include_headers) src/vector/src/vector_trig.proto.c - -# builds for specific architectures -# ... +src/vector/src/vectorf_add.o : %.o : %.c $(include_headers) src/vector/src/vector_add.proto.c +src/vector/src/vectorf_norm.o : %.o : %.c $(include_headers) src/vector/src/vector_norm.proto.c +src/vector/src/vectorf_mul.o : %.o : %.c $(include_headers) src/vector/src/vector_mul.proto.c +src/vector/src/vectorf_trig.o : %.o : %.c $(include_headers) src/vector/src/vector_trig.proto.c +src/vector/src/vectorcf_add.o : %.o : %.c $(include_headers) src/vector/src/vector_add.proto.c +src/vector/src/vectorcf_norm.o : %.o : %.c $(include_headers) src/vector/src/vector_norm.proto.c +src/vector/src/vectorcf_mul.o : %.o : %.c $(include_headers) src/vector/src/vector_mul.proto.c +src/vector/src/vectorcf_trig.o : %.o : %.c $(include_headers) src/vector/src/vector_trig.proto.c + +# specific machine architectures + +# avx512f +src/vector/src/vectorcf_mul.avx512f.o : %.o : %.c $(include_headers) + +# AVX/AVX2 +src/vector/src/vectorcf_mul.avx.o : %.o : %.c $(include_headers) + +# SSE2/SSE3/SSE4.1/SSE4.2 +src/vector/src/vectorcf_mul.sse.o : %.o : %.c $(include_headers) + +# AltiVec +# TODO... + +# ARM Neon +# TODO... # vector autotest scripts vector_autotests := diff --git a/src/random/src/scramble.avx.c b/src/random/src/scramble.avx.c new file mode 100644 index 000000000..fe7d4f45a --- /dev/null +++ b/src/random/src/scramble.avx.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2007 - 2015 Joseph Gaeddert + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +// +// Scramble (AVX SIMD version) +// + +#include + +#include "liquid.internal.h" + +void scramble_data(unsigned char * _x, + unsigned int _n) +{ + // t = 32*(floor(_n/32)) + unsigned int t = (_n >> 5) << 5; + + __m256i x; + __m256i mask = _mm256_set_epi8(LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0); + + // apply static masks + unsigned int i; + for (i=0; i> 2) << 2; + + __m256i x; + __m256i y; + __m256i mask = _mm256_set_epi8((LIQUID_SCRAMBLE_MASK3 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK3 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK2 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK2 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK1 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK1 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK0 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK0 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x80) ? 0xFF : 0); + + // apply static masks + unsigned int i; + for (i=0; i + +#include "liquid.internal.h" + +void scramble_data(unsigned char * _x, + unsigned int _n) +{ + // t = 64*(floor(_n/64)) + unsigned int t = (_n >> 6) << 6; + + __m512i x; + __m512i mask = _mm512_set_epi8(LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0); + + // apply static masks + unsigned int i; + for (i=0; i> 3) << 3; + + __m512i x; + __mmask64 mask = ((__mmask64)liquid_reverse_byte_gentab[LIQUID_SCRAMBLE_MASK3] << 24) | + ((__mmask64)liquid_reverse_byte_gentab[LIQUID_SCRAMBLE_MASK2] << 16) | + ((__mmask64)liquid_reverse_byte_gentab[LIQUID_SCRAMBLE_MASK1] << 8) | + (__mmask64)(liquid_reverse_byte_gentab[LIQUID_SCRAMBLE_MASK0] << 0); + mask |= (mask << 32); + __m512i max = _mm512_set1_epi8(255); + + // apply static masks + unsigned int i; + for (i=0; i #include diff --git a/src/random/src/scramble.sse.c b/src/random/src/scramble.sse.c new file mode 100644 index 000000000..0ee97189c --- /dev/null +++ b/src/random/src/scramble.sse.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2007 - 2015 Joseph Gaeddert + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +// +// Scramble (SSE SIMD version) +// + +#if HAVE_SSE2 +#include +#endif + +#if HAVE_SSE4_1 +#include +#endif + +#include "liquid.internal.h" + +void scramble_data(unsigned char * _x, + unsigned int _n) +{ + // t = 16*(floor(_n/16)) + unsigned int t = (_n >> 4) << 4; + + __m128i x; + __m128i mask = _mm_set_epi8(LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0, + LIQUID_SCRAMBLE_MASK3, LIQUID_SCRAMBLE_MASK2, LIQUID_SCRAMBLE_MASK1, LIQUID_SCRAMBLE_MASK0); + + // apply static masks + unsigned int i; + for (i=0; i> 2) << 2; + + __m128i x; + __m128i y; + __m128i mask01 = _mm_set_epi8((LIQUID_SCRAMBLE_MASK1 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK1 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK0 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK0 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x80) ? 0xFF : 0); + __m128i mask23 = _mm_set_epi8((LIQUID_SCRAMBLE_MASK3 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK3 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK2 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK2 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x80) ? 0xFF : 0); + + // apply static masks + unsigned int i; + for (i=0; i + +// basic vector multiplication, unrolling loop +// _x : first array [size: _n x 1] +// _y : second array [size: _n x 1] +// _n : array lengths +// _z : output array pointer [size: _n x 1] +void liquid_vectorcf_mul(float complex *_x, + float complex *_y, + unsigned int _n, + float complex *_z) +{ + // type cast as floating point array + float * x = (float*) _x; + float * y = (float*) _y; + float * z = (float*) _z; + + // double effective length + unsigned int n = 2*_n; + + // temporary buffers + __m256 rx, ry, rz; + + // t = 8*(floor(_n/8)) + unsigned int t = (n >> 3) << 3; + + unsigned int i; + for (i=0; i> 3) << 3; + + unsigned int i; + for (i=0; i + +// basic vector multiplication, unrolling loop +// _x : first array [size: _n x 1] +// _y : second array [size: _n x 1] +// _n : array lengths +// _z : output array pointer [size: _n x 1] +void liquid_vectorcf_mul(float complex *_x, + float complex *_y, + unsigned int _n, + float complex *_z) +{ + // type cast as floating point array + float * x = (float*) _x; + float * y = (float*) _y; + float * z = (float*) _z; + + // double effective length + unsigned int n = 2*_n; + + // temporary buffers + __m512 rx, ry, rz; + __m512 one = _mm512_set1_ps(1.0f); + + // t = 16*(floor(_n/16)) + unsigned int t = (n >> 4) << 4; + + unsigned int i; + for (i=0; i> 4) << 4; + + unsigned int i; + for (i=0; i + +// basic vector multiplication, unrolling loop +// _x : first array [size: _n x 1] +// _y : second array [size: _n x 1] +// _n : array lengths +// _z : output array pointer [size: _n x 1] +void liquid_vectorcf_mul(float complex *_x, + float complex *_y, + unsigned int _n, + float complex *_z) +{ + // type cast as floating point array + float * x = (float*) _x; + float * y = (float*) _y; + float * z = (float*) _z; + + // double effective length + unsigned int n = 2*_n; + + // temporary buffers + __m128 rx, ry, rz; + + // t = 4*(floor(_n/4)) + unsigned int t = (n >> 2) << 2; + + unsigned int i; + for (i=0; i> 2) << 2; + + unsigned int i; + for (i=0; i Date: Sun, 19 May 2024 22:47:10 +0100 Subject: [PATCH 2/4] Use lddqu instead of loadu since it may perform better in certain situations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: João Silva --- src/random/src/scramble.avx.c | 4 ++-- src/random/src/scramble.sse.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/random/src/scramble.avx.c b/src/random/src/scramble.avx.c index fe7d4f45a..eb7d1191a 100644 --- a/src/random/src/scramble.avx.c +++ b/src/random/src/scramble.avx.c @@ -47,7 +47,7 @@ void scramble_data(unsigned char * _x, // apply static masks unsigned int i; for (i=0; i Date: Mon, 20 May 2024 13:41:39 +0100 Subject: [PATCH 3/4] Use x XOR 255 to achieve 255-x to make it faster MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: João Silva --- configure.ac | 4 +-- src/random/src/scramble.avx.c | 32 ++++++++++----------- src/random/src/scramble.avx512f.c | 48 +++++++++++++++++-------------- src/random/src/scramble.c | 16 +++++------ src/random/src/scramble.sse.c | 23 +++++++-------- 5 files changed, 62 insertions(+), 61 deletions(-) diff --git a/configure.ac b/configure.ac index d9c3e714f..225fbd96d 100644 --- a/configure.ac +++ b/configure.ac @@ -184,7 +184,7 @@ else AX_EXT if [ test "$ax_cv_have_avx512bw_ext" = yes ]; then - # AVX512 extensions + # AVX512-BW extensions MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.avx512f.o \ src/dotprod/src/dotprod_crcf.avx512f.o \ src/dotprod/src/dotprod_rrrf.avx512f.o \ @@ -200,7 +200,7 @@ else MLIBS_RANDOM="src/random/src/scramble.avx512f.o" ARCH_OPTION='-mavx512bw' elif [ test "$ax_cv_have_avx512f_ext" = yes ]; then - # AVX512 extensions + # AVX512-F extensions MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.avx512f.o \ src/dotprod/src/dotprod_crcf.avx512f.o \ src/dotprod/src/dotprod_rrrf.avx512f.o \ diff --git a/src/random/src/scramble.avx.c b/src/random/src/scramble.avx.c index eb7d1191a..f990d5a04 100644 --- a/src/random/src/scramble.avx.c +++ b/src/random/src/scramble.avx.c @@ -79,12 +79,10 @@ void unscramble_data(unsigned char * _x, void unscramble_data_soft(unsigned char * _x, unsigned int _n) { -#if HAVE_AVX2 // t = 4*(floor(_n/4)) unsigned int t = (_n >> 2) << 2; __m256i x; - __m256i y; __m256i mask = _mm256_set_epi8((LIQUID_SCRAMBLE_MASK3 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x08) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x80) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x08) ? 0xFF : 0, @@ -98,17 +96,17 @@ void unscramble_data_soft(unsigned char * _x, unsigned int i; for (i=0; i> 3) << 3; __m512i x; - __mmask64 mask = ((__mmask64)liquid_reverse_byte_gentab[LIQUID_SCRAMBLE_MASK3] << 24) | - ((__mmask64)liquid_reverse_byte_gentab[LIQUID_SCRAMBLE_MASK2] << 16) | - ((__mmask64)liquid_reverse_byte_gentab[LIQUID_SCRAMBLE_MASK1] << 8) | - (__mmask64)(liquid_reverse_byte_gentab[LIQUID_SCRAMBLE_MASK0] << 0); - mask |= (mask << 32); - __m512i max = _mm512_set1_epi8(255); + __m512i mask = _mm512_set_epi8((LIQUID_SCRAMBLE_MASK3 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK3 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK2 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK2 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK1 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK1 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK0 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK0 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK3 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK3 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK3 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK2 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK2 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK2 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK1 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK1 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x80) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK0 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x08) ? 0xFF : 0, + (LIQUID_SCRAMBLE_MASK0 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x80) ? 0xFF : 0); // apply static masks unsigned int i; for (i=0; i> 2) << 2; __m128i x; - __m128i y; __m128i mask01 = _mm_set_epi8((LIQUID_SCRAMBLE_MASK1 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x08) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x10) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x20) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x40) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK1 & 0x80) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x01) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x02) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x04) ? 0xFF : 0, (LIQUID_SCRAMBLE_MASK0 & 0x08) ? 0xFF : 0, @@ -97,13 +96,11 @@ void unscramble_data_soft(unsigned char * _x, unsigned int i; for (i=0; i Date: Tue, 4 Jun 2024 19:55:46 +0100 Subject: [PATCH 4/4] Fix typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: João Silva --- makefile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/makefile.in b/makefile.in index 82b1a1483..6780e72bf 100644 --- a/makefile.in +++ b/makefile.in @@ -1081,7 +1081,7 @@ src/random/src/scramble.o : %.o : %.c $(include_headers) # specific machine architectures # avx512f -src/random/src/scramble.avx512v.o : %.o : %.c $(include_headers) +src/random/src/scramble.avx512f.o : %.o : %.c $(include_headers) # AVX/AVX2 src/random/src/scramble.avx.o : %.o : %.c $(include_headers)