Skip to content

Commit

Permalink
Add SIMD versions of scrambler and vector multiplication
Browse files Browse the repository at this point in the history
Signed-off-by: João Silva <jgc3silva@gmail.com>
  • Loading branch information
vankxr committed May 19, 2024
1 parent 030b5b4 commit 6af7fdf
Show file tree
Hide file tree
Showing 18 changed files with 1,832 additions and 41 deletions.
850 changes: 850 additions & 0 deletions benchmark.json

Large diffs are not rendered by default.

140 changes: 127 additions & 13 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,15 @@ if test "${enable_simdoverride+set}" = set; then
src/dotprod/src/dotprod_crcf.o \
src/dotprod/src/dotprod_rrrf.o \
src/dotprod/src/sumsq.o"
MLIBS_VECTOR="src/vector/src/vectorf_add.o \
src/vector/src/vectorf_norm.o \
src/vector/src/vectorf_mul.o \
src/vector/src/vectorf_trig.o \
src/vector/src/vectorcf_add.o \
src/vector/src/vectorcf_norm.o \
src/vector/src/vectorcf_mul.o \
src/vector/src/vectorcf_trig.o"
MLIBS_RANDOM="src/random/src/scramble.o"
ARCH_OPTION=""
else
# Check canonical system
Expand All @@ -174,67 +183,164 @@ else
# AVX512 : immintrin.h
AX_EXT

if [ test "$ax_cv_have_avx512f_ext" = yes ]; then
if [ test "$ax_cv_have_avx512bw_ext" = yes ]; then
# AVX512BW extensions (built with -mavx512bw; reuses the *.avx512f.o objects)
MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.avx512f.o \
src/dotprod/src/dotprod_crcf.avx512f.o \
src/dotprod/src/dotprod_rrrf.avx512f.o \
src/dotprod/src/sumsq.avx512f.o"
MLIBS_VECTOR="src/vector/src/vectorf_add.o \
src/vector/src/vectorf_norm.o \
src/vector/src/vectorf_mul.o \
src/vector/src/vectorf_trig.o \
src/vector/src/vectorcf_add.o \
src/vector/src/vectorcf_norm.o \
src/vector/src/vectorcf_mul.avx512f.o \
src/vector/src/vectorcf_trig.o"
MLIBS_RANDOM="src/random/src/scramble.avx512f.o"
ARCH_OPTION='-mavx512bw'
elif [ test "$ax_cv_have_avx512f_ext" = yes ]; then
# AVX512F extensions (AVX512BW not available)
MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.avx512f.o \
src/dotprod/src/dotprod_crcf.avx512f.o \
src/dotprod/src/dotprod_rrrf.avx512f.o \
src/dotprod/src/sumsq.avx512f.o"
MLIBS_VECTOR="src/vector/src/vectorf_add.o \
src/vector/src/vectorf_norm.o \
src/vector/src/vectorf_mul.o \
src/vector/src/vectorf_trig.o \
src/vector/src/vectorcf_add.o \
src/vector/src/vectorcf_norm.o \
src/vector/src/vectorcf_mul.avx512f.o \
src/vector/src/vectorcf_trig.o"
MLIBS_RANDOM="src/random/src/scramble.avx512f.o"
ARCH_OPTION='-mavx512f'
elif [ test "$ax_cv_have_avx2_ext" = yes ]; then
# AVX2 extensions
MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.avx.o \
src/dotprod/src/dotprod_crcf.avx.o \
src/dotprod/src/dotprod_rrrf.avx.o \
src/dotprod/src/sumsq.avx.o"
MLIBS_VECTOR="src/vector/src/vectorf_add.o \
src/vector/src/vectorf_norm.o \
src/vector/src/vectorf_mul.o \
src/vector/src/vectorf_trig.o \
src/vector/src/vectorcf_add.o \
src/vector/src/vectorcf_norm.o \
src/vector/src/vectorcf_mul.avx.o \
src/vector/src/vectorcf_trig.o"
MLIBS_RANDOM="src/random/src/scramble.avx.o"
ARCH_OPTION='-mavx2'
elif [ test "$ax_cv_have_avx_ext" = yes ]; then
# AVX extensions
MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.avx.o \
src/dotprod/src/dotprod_crcf.avx.o \
src/dotprod/src/dotprod_rrrf.avx.o \
src/dotprod/src/sumsq.avx.o"
MLIBS_VECTOR="src/vector/src/vectorf_add.o \
src/vector/src/vectorf_norm.o \
src/vector/src/vectorf_mul.o \
src/vector/src/vectorf_trig.o \
src/vector/src/vectorcf_add.o \
src/vector/src/vectorcf_norm.o \
src/vector/src/vectorcf_mul.avx.o \
src/vector/src/vectorcf_trig.o"
MLIBS_RANDOM="src/random/src/scramble.sse.o"
ARCH_OPTION='-mavx'
elif [ test "$ax_cv_have_sse41_ext" = yes ]; then
# SSE4.1/2 extensions
MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.sse.o \
src/dotprod/src/dotprod_crcf.sse.o \
src/dotprod/src/dotprod_rrrf.sse.o \
src/dotprod/src/sumsq.sse.o"
MLIBS_VECTOR="src/vector/src/vectorf_add.o \
src/vector/src/vectorf_norm.o \
src/vector/src/vectorf_mul.o \
src/vector/src/vectorf_trig.o \
src/vector/src/vectorcf_add.o \
src/vector/src/vectorcf_norm.o \
src/vector/src/vectorcf_mul.sse.o \
src/vector/src/vectorcf_trig.o"
MLIBS_RANDOM="src/random/src/scramble.sse.o"
ARCH_OPTION='-msse4.1'
elif [ test "$ax_cv_have_sse3_ext" = yes ]; then
# SSE3 extensions
MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.sse.o \
src/dotprod/src/dotprod_crcf.sse.o \
src/dotprod/src/dotprod_rrrf.sse.o \
src/dotprod/src/sumsq.sse.o"
MLIBS_VECTOR="src/vector/src/vectorf_add.o \
src/vector/src/vectorf_norm.o \
src/vector/src/vectorf_mul.o \
src/vector/src/vectorf_trig.o \
src/vector/src/vectorcf_add.o \
src/vector/src/vectorcf_norm.o \
src/vector/src/vectorcf_mul.sse.o \
src/vector/src/vectorcf_trig.o"
MLIBS_RANDOM="src/random/src/scramble.sse.o"
ARCH_OPTION='-msse3'
elif [ test "$ax_cv_have_sse2_ext" = yes ]; then
# SSE2 extensions
MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.sse.o \
src/dotprod/src/dotprod_crcf.sse.o \
src/dotprod/src/dotprod_rrrf.sse.o \
src/dotprod/src/sumsq.sse.o"
MLIBS_VECTOR="src/vector/src/vectorf_add.o \
src/vector/src/vectorf_norm.o \
src/vector/src/vectorf_mul.o \
src/vector/src/vectorf_trig.o \
src/vector/src/vectorcf_add.o \
src/vector/src/vectorcf_norm.o \
src/vector/src/vectorcf_mul.o \
src/vector/src/vectorcf_trig.o"
MLIBS_RANDOM="src/random/src/scramble.sse.o"
ARCH_OPTION='-msse2'
else
# portable C version
MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.o \
src/dotprod/src/dotprod_crcf.o \
src/dotprod/src/dotprod_rrrf.o \
src/dotprod/src/sumsq.o"
MLIBS_VECTOR="src/vector/src/vectorf_add.o \
src/vector/src/vectorf_norm.o \
src/vector/src/vectorf_mul.o \
src/vector/src/vectorf_trig.o \
src/vector/src/vectorcf_add.o \
src/vector/src/vectorcf_norm.o \
src/vector/src/vectorcf_mul.o \
src/vector/src/vectorcf_trig.o"
MLIBS_RANDOM="src/random/src/scramble.o"
fi;;
powerpc*)
MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.o \
src/dotprod/src/dotprod_rrrf.av.o \
src/dotprod/src/dotprod_crcf.av.o \
src/dotprod/src/sumsq.o"
MLIBS_VECTOR="src/vector/src/vectorf_add.o \
src/vector/src/vectorf_norm.o \
src/vector/src/vectorf_mul.o \
src/vector/src/vectorf_trig.o \
src/vector/src/vectorcf_add.o \
src/vector/src/vectorcf_norm.o \
src/vector/src/vectorcf_mul.o \
src/vector/src/vectorcf_trig.o"
MLIBS_RANDOM="src/random/src/scramble.o"
ARCH_OPTION="-fno-common -faltivec";;
armv1*|armv2*|armv3*|armv4*|armv5*|armv6*)
# assume neon instructions are NOT available
MLIBS_DOTPROD="src/dotprod/src/dotprod_cccf.o \
src/dotprod/src/dotprod_crcf.o \
src/dotprod/src/dotprod_rrrf.o \
src/dotprod/src/sumsq.o"
MLIBS_VECTOR="src/vector/src/vectorf_add.o \
src/vector/src/vectorf_norm.o \
src/vector/src/vectorf_mul.o \
src/vector/src/vectorf_trig.o \
src/vector/src/vectorcf_add.o \
src/vector/src/vectorcf_norm.o \
src/vector/src/vectorcf_mul.o \
src/vector/src/vectorcf_trig.o"
MLIBS_RANDOM="src/random/src/scramble.o"
ARCH_OPTION="-ffast-math";;
arm|armv7*|armv8*)
# assume neon instructions are available
Expand All @@ -245,6 +351,15 @@ else
src/dotprod/src/dotprod_crcf.neon.o \
src/dotprod/src/dotprod_rrrf.neon.o \
src/dotprod/src/sumsq.o"
MLIBS_VECTOR="src/vector/src/vectorf_add.o \
src/vector/src/vectorf_norm.o \
src/vector/src/vectorf_mul.o \
src/vector/src/vectorf_trig.o \
src/vector/src/vectorcf_add.o \
src/vector/src/vectorcf_norm.o \
src/vector/src/vectorcf_mul.o \
src/vector/src/vectorcf_trig.o"
MLIBS_RANDOM="src/random/src/scramble.o"
case $target_os in
darwin*)
# M1 mac, ARM architecture : use neon extensions
Expand All @@ -260,21 +375,19 @@ else
src/dotprod/src/dotprod_crcf.o \
src/dotprod/src/dotprod_rrrf.o \
src/dotprod/src/sumsq.o"
MLIBS_VECTOR="src/vector/src/vectorf_add.o \
src/vector/src/vectorf_norm.o \
src/vector/src/vectorf_mul.o \
src/vector/src/vectorf_trig.o \
src/vector/src/vectorcf_add.o \
src/vector/src/vectorcf_norm.o \
src/vector/src/vectorcf_mul.o \
src/vector/src/vectorcf_trig.o"
MLIBS_RANDOM="src/random/src/scramble.o"
ARCH_OPTION="";;
esac
fi


# for now all vector operations are portable C versions
MLIBS_VECTOR="src/vector/src/vectorf_add.port.o \
src/vector/src/vectorf_norm.port.o \
src/vector/src/vectorf_mul.port.o \
src/vector/src/vectorf_trig.port.o \
src/vector/src/vectorcf_add.port.o \
src/vector/src/vectorcf_norm.port.o \
src/vector/src/vectorcf_mul.port.o \
src/vector/src/vectorcf_trig.port.o"

case $target_os in
darwin*)
AN_MAKEVAR([LIBTOOL], [AC_PROG_LIBTOOL])
Expand Down Expand Up @@ -302,8 +415,9 @@ esac
# autoconf variable substitutions
#
AC_SUBST(LIBS) # shared libraries (-lc, -lm, etc.)
AC_SUBST(MLIBS_DOTPROD) #
AC_SUBST(MLIBS_DOTPROD) #
AC_SUBST(MLIBS_VECTOR) #
AC_SUBST(MLIBS_RANDOM) #

AC_SUBST(AR_LIB) # archive library
AC_SUBST(SH_LIB) # output shared library target
Expand Down
97 changes: 70 additions & 27 deletions makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -208,21 +208,27 @@ src/dotprod/src/sumsq.o : %.o : %.c $(include_headers)

# specific machine architectures

# AltiVec
src/dotprod/src/dotprod_rrrf.av.o : %.o : %.c $(include_headers)

# MMX/SSE2
src/dotprod/src/dotprod_rrrf.mmx.o : %.o : %.c $(include_headers)
src/dotprod/src/dotprod_crcf.mmx.o : %.o : %.c $(include_headers)
src/dotprod/src/dotprod_cccf.mmx.o : %.o : %.c $(include_headers)

src/dotprod/src/sumsq.mmx.o : %.o : %.c $(include_headers)
# AVX512F
src/dotprod/src/dotprod_rrrf.avx512f.o : %.o : %.c $(include_headers)
src/dotprod/src/dotprod_crcf.avx512f.o : %.o : %.c $(include_headers)
src/dotprod/src/dotprod_cccf.avx512f.o : %.o : %.c $(include_headers)
src/dotprod/src/sumsq.avx512f.o : %.o : %.c $(include_headers)

# AVX/AVX2
src/dotprod/src/dotprod_rrrf.avx.o : %.o : %.c $(include_headers)
src/dotprod/src/dotprod_crcf.avx.o : %.o : %.c $(include_headers)
src/dotprod/src/dotprod_cccf.avx.o : %.o : %.c $(include_headers)
src/dotprod/src/sumsq.avx.o : %.o : %.c $(include_headers)

# SSE4.1/2
src/dotprod/src/dotprod_rrrf.sse4.o : %.o : %.c $(include_headers)
# SSE2/SSE3/SSE4.1/SSE4.2
src/dotprod/src/dotprod_rrrf.sse.o : %.o : %.c $(include_headers)
src/dotprod/src/dotprod_crcf.sse.o : %.o : %.c $(include_headers)
src/dotprod/src/dotprod_cccf.sse.o : %.o : %.c $(include_headers)
src/dotprod/src/sumsq.sse.o : %.o : %.c $(include_headers)

# AVX
src/dotprod/src/sumsq.avx.o : %.o : %.c $(include_headers)
# AltiVec
src/dotprod/src/dotprod_rrrf.av.o : %.o : %.c $(include_headers)
src/dotprod/src/dotprod_crcf.av.o : %.o : %.c $(include_headers)

# ARM Neon
src/dotprod/src/dotprod_rrrf.neon.o : %.o : %.c $(include_headers)
Expand Down Expand Up @@ -1048,10 +1054,11 @@ quantization_benchmarks := \
src/quantization/bench/quantizer_benchmark.c \
src/quantization/bench/compander_benchmark.c \

#
#
# MODULE : random
#

# main objects that only have portable builds
random_objects := \
src/random/src/rand.o \
src/random/src/randn.o \
Expand All @@ -1060,11 +1067,33 @@ random_objects := \
src/random/src/randgamma.o \
src/random/src/randnakm.o \
src/random/src/randricek.o \
src/random/src/scramble.o \


$(random_objects) : %.o : %.c $(include_headers)

# main objects list
random_objects += \
@MLIBS_RANDOM@ \

# portable builds
src/random/src/scramble.o : %.o : %.c $(include_headers)

# specific machine architectures
#
# Each rule below only attaches prerequisites (the matching .c file and the
# public headers) to one of the objects that configure may substitute into
# @MLIBS_RANDOM@; the target names must therefore match configure's names
# exactly.

# AVX512F (configure emits scramble.avx512f.o for both AVX512F and AVX512BW)
src/random/src/scramble.avx512f.o : %.o : %.c $(include_headers)

# AVX/AVX2
src/random/src/scramble.avx.o : %.o : %.c $(include_headers)

# SSE2/SSE3/SSE4.1/SSE4.2
src/random/src/scramble.sse.o : %.o : %.c $(include_headers)

# AltiVec
# TODO...

# ARM Neon
# TODO...

# autotests
random_autotests := \
src/random/tests/scramble_autotest.c \
Expand Down Expand Up @@ -1135,17 +1164,31 @@ vector_objects := \
@MLIBS_VECTOR@ \

# portable builds
src/vector/src/vectorf_add.port.o : %.o : %.c $(include_headers) src/vector/src/vector_add.proto.c
src/vector/src/vectorf_norm.port.o : %.o : %.c $(include_headers) src/vector/src/vector_norm.proto.c
src/vector/src/vectorf_mul.port.o : %.o : %.c $(include_headers) src/vector/src/vector_mul.proto.c
src/vector/src/vectorf_trig.port.o : %.o : %.c $(include_headers) src/vector/src/vector_trig.proto.c
src/vector/src/vectorcf_add.port.o : %.o : %.c $(include_headers) src/vector/src/vector_add.proto.c
src/vector/src/vectorcf_norm.port.o : %.o : %.c $(include_headers) src/vector/src/vector_norm.proto.c
src/vector/src/vectorcf_mul.port.o : %.o : %.c $(include_headers) src/vector/src/vector_mul.proto.c
src/vector/src/vectorcf_trig.port.o : %.o : %.c $(include_headers) src/vector/src/vector_trig.proto.c

# builds for specific architectures
# ...
# Portable C builds. The real (vectorf) and complex (vectorcf) objects of each
# operation have the same extra prerequisite, a shared prototype source, so
# each pair is declared by one static pattern rule.
src/vector/src/vectorf_add.o  src/vector/src/vectorcf_add.o  : %.o : %.c $(include_headers) src/vector/src/vector_add.proto.c
src/vector/src/vectorf_norm.o src/vector/src/vectorcf_norm.o : %.o : %.c $(include_headers) src/vector/src/vector_norm.proto.c
src/vector/src/vectorf_mul.o  src/vector/src/vectorcf_mul.o  : %.o : %.c $(include_headers) src/vector/src/vector_mul.proto.c
src/vector/src/vectorf_trig.o src/vector/src/vectorcf_trig.o : %.o : %.c $(include_headers) src/vector/src/vector_trig.proto.c

# Architecture-specific builds of vectorcf_mul (AVX512F, AVX/AVX2 and
# SSE2/SSE3/SSE4.1/SSE4.2). These depend only on their own source file and
# the public headers, not on the prototype source.
src/vector/src/vectorcf_mul.avx512f.o \
src/vector/src/vectorcf_mul.avx.o \
src/vector/src/vectorcf_mul.sse.o : %.o : %.c $(include_headers)

# AltiVec
# TODO...

# ARM Neon
# TODO...

# vector autotest scripts
vector_autotests :=
Expand Down
Loading

0 comments on commit 6af7fdf

Please sign in to comment.