diff --git a/.gitmodules b/.gitmodules index b35bb3cd648..877d742a7c1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -19,3 +19,6 @@ [submodule "ext_libs/vcpkg"] path = ext_libs/vcpkg url = ../../microsoft/vcpkg.git +[submodule "ext_libs/sse2neon"] + path = ext_libs/sse2neon/sse2neon + url = https://github.com/DLTcollab/sse2neon diff --git a/ext_libs/ext_libs.cmake b/ext_libs/ext_libs.cmake index 6b62a9748f4..331631d0468 100644 --- a/ext_libs/ext_libs.cmake +++ b/ext_libs/ext_libs.cmake @@ -104,3 +104,8 @@ else() add_library(eigen INTERFACE) target_include_directories(eigen SYSTEM INTERFACE $) endif() + +add_library(sse2neon INTERFACE) +# This submodule is placed into a nested subdirectory since it exposes its +# header at the root of the repo rather than its own nested sse2neon/ dir +target_include_directories(sse2neon SYSTEM INTERFACE "${CMAKE_CURRENT_LIST_DIR}/sse2neon") diff --git a/ext_libs/sse2neon/sse2neon b/ext_libs/sse2neon/sse2neon new file mode 160000 index 00000000000..270cf6efbc7 --- /dev/null +++ b/ext_libs/sse2neon/sse2neon @@ -0,0 +1 @@ +Subproject commit 270cf6efbc7efaae1ea017727079e929814b8002 diff --git a/vowpalwabbit/core/CMakeLists.txt b/vowpalwabbit/core/CMakeLists.txt index d44918e10be..9b9b73595d5 100644 --- a/vowpalwabbit/core/CMakeLists.txt +++ b/vowpalwabbit/core/CMakeLists.txt @@ -364,7 +364,7 @@ vw_add_library( # Use BUILD_INTERFACE to prevent them from being exported, i.e. treat them as PRIVATE # https://gitlab.kitware.com/cmake/cmake/issues/15415 ${CMAKE_DL_LIBS} ${LINK_THREADS} vw_io $ $ - $ + $ $ DESCRIPTION "This contains all remaining VW code, all reduction implementations, driver, option handling" EXCEPTION_DESCRIPTION "Yes" ENABLE_INSTALL diff --git a/vowpalwabbit/core/src/reductions/lda_core.cc b/vowpalwabbit/core/src/reductions/lda_core.cc index 161c719f4d0..16c63201070 100644 --- a/vowpalwabbit/core/src/reductions/lda_core.cc +++ b/vowpalwabbit/core/src/reductions/lda_core.cc @@ -32,6 +32,10 @@ VW_WARNING_STATE_POP #include "vw/core/vw_versions.h" #include "vw/io/logger.h" +#if defined(__ARM_NEON) +# include +#endif + #include #include #include @@ -164,7 +168,7 @@ inline float fastdigamma(float x) #if !defined(VW_NO_INLINE_SIMD) -# if defined(__SSE2__) || defined(__SSE3__) || defined(__SSE4_1__) +# if defined(__SSE2__) || defined(__SSE3__) || defined(__SSE4_1__) || defined(__ARM_NEON) namespace { @@ -186,6 +190,13 @@ inline bool is_aligned16(void* ptr) # include # endif +// Transport SSE intrinsics through sse2neon on ARM: +# if defined(__ARM_NEON) +# define __SSE2__ 1 +# define __SSE3__ 1 +# define __SSE4_1__ 1 +# endif + # define HAVE_SIMD_MATHMODE typedef __m128 v4sf;