From 8ec5ffa32dff5e256811163b75166f7ec241d25e Mon Sep 17 00:00:00 2001 From: Rami ALZEBAK Date: Thu, 24 Nov 2022 16:03:52 +0100 Subject: [PATCH 1/7] Add arm64 optimizations flags --- cmake/VWFlags.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmake/VWFlags.cmake b/cmake/VWFlags.cmake index 2aeef4c885d..c0da2ce2c3c 100644 --- a/cmake/VWFlags.cmake +++ b/cmake/VWFlags.cmake @@ -18,9 +18,14 @@ if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64") endif() endif() +set(LINUX_ARM64_OPT_FLAGS "") +if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64|arm64|ARM64") + set(LINUX_ARM64_OPT_FLAGS -mcpu=neoverse-n1) +endif() + # Add -ffast-math for speed, remove for testability. # no-stack-check is added to mitigate stack alignment issue on Catalina where there is a bug with aligning stack-check instructions, and stack-check became default option -set(LINUX_RELEASE_CONFIG -fno-strict-aliasing ${LINUX_X86_64_OPT_FLAGS} -fno-stack-check -fomit-frame-pointer) +set(LINUX_RELEASE_CONFIG -fno-strict-aliasing ${LINUX_X86_64_OPT_FLAGS} ${LINUX_ARM64_OPT_FLAGS} -fno-stack-check -fomit-frame-pointer) set(LINUX_DEBUG_CONFIG -fno-stack-check) #Use default visiblity on UNIX otherwise a lot of the C++ symbols end up for exported and interpose'able From ec46d503776929ad86136f2ba3e469632fc500c6 Mon Sep 17 00:00:00 2001 From: Rami ALZEBAK Date: Thu, 24 Nov 2022 16:40:01 +0100 Subject: [PATCH 2/7] Transport SSE intrinsics through sse2neon on ARM --- .gitmodules | 3 +++ ext_libs/sse2neon | 1 + vowpalwabbit/core/src/reductions/lda_core.cc | 13 ++++++++++++- 3 files changed, 16 insertions(+), 1 deletion(-) create mode 160000 ext_libs/sse2neon diff --git a/.gitmodules b/.gitmodules index b35bb3cd648..367d94dc608 100644 --- a/.gitmodules +++ b/.gitmodules @@ -19,3 +19,6 @@ [submodule "ext_libs/vcpkg"] path = ext_libs/vcpkg url = ../../microsoft/vcpkg.git +[submodule "ext_libs/sse2neon"] + path = ext_libs/sse2neon + url = https://github.com/DLTcollab/sse2neon diff --git 
a/ext_libs/sse2neon b/ext_libs/sse2neon new file mode 160000 index 00000000000..270cf6efbc7 --- /dev/null +++ b/ext_libs/sse2neon @@ -0,0 +1 @@ +Subproject commit 270cf6efbc7efaae1ea017727079e929814b8002 diff --git a/vowpalwabbit/core/src/reductions/lda_core.cc b/vowpalwabbit/core/src/reductions/lda_core.cc index d97a12133dd..2cd9353781e 100644 --- a/vowpalwabbit/core/src/reductions/lda_core.cc +++ b/vowpalwabbit/core/src/reductions/lda_core.cc @@ -32,6 +32,10 @@ VW_WARNING_STATE_POP #include "vw/core/vw_versions.h" #include "vw/io/logger.h" +#if defined(__ARM_NEON) +#include +#endif + #include #include #include @@ -164,7 +168,7 @@ inline float fastdigamma(float x) #if !defined(VW_NO_INLINE_SIMD) -# if defined(__SSE2__) || defined(__SSE3__) || defined(__SSE4_1__) +# if defined(__SSE2__) || defined(__SSE3__) || defined(__SSE4_1__) || defined(__ARM_NEON) namespace { @@ -186,6 +190,13 @@ inline bool is_aligned16(void* ptr) # include # endif +// Transport SSE intrinsics through sse2neon on ARM: +#if defined(__ARM_NEON) +#define __SSE2__ 1 +#define __SSE3__ 1 +#define __SSE4_1__ 1 +#endif + # define HAVE_SIMD_MATHMODE typedef __m128 v4sf; From 8150684e29105635056e087048f2f38579053011 Mon Sep 17 00:00:00 2001 From: Jack Gerrits Date: Fri, 2 Dec 2022 13:13:53 -0500 Subject: [PATCH 3/7] expose sse2neon to cmake --- .gitmodules | 2 +- ext_libs/ext_libs.cmake | 5 +++++ ext_libs/{ => sse2neon}/sse2neon | 0 vowpalwabbit/core/CMakeLists.txt | 2 +- 4 files changed, 7 insertions(+), 2 deletions(-) rename ext_libs/{ => sse2neon}/sse2neon (100%) diff --git a/.gitmodules b/.gitmodules index 367d94dc608..877d742a7c1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -20,5 +20,5 @@ path = ext_libs/vcpkg url = ../../microsoft/vcpkg.git [submodule "ext_libs/sse2neon"] - path = ext_libs/sse2neon + path = ext_libs/sse2neon/sse2neon url = https://github.com/DLTcollab/sse2neon diff --git a/ext_libs/ext_libs.cmake b/ext_libs/ext_libs.cmake index 3f488e27dd6..fa89def0c77 100644 --- 
a/ext_libs/ext_libs.cmake +++ b/ext_libs/ext_libs.cmake @@ -100,3 +100,8 @@ else() add_library(eigen INTERFACE) target_include_directories(eigen SYSTEM INTERFACE $) endif() + +add_library(sse2neon INTERFACE) +# This submodule is placed into a nested subdirectory since it exposes its +# header at the root of the repo rather than its own nested sse2neon/ dir +target_include_directories(sse2neon SYSTEM INTERFACE "${CMAKE_CURRENT_LIST_DIR}/sse2neon") diff --git a/ext_libs/sse2neon b/ext_libs/sse2neon/sse2neon similarity index 100% rename from ext_libs/sse2neon rename to ext_libs/sse2neon/sse2neon diff --git a/vowpalwabbit/core/CMakeLists.txt b/vowpalwabbit/core/CMakeLists.txt index 0715ab3abd7..efe9dc0cb38 100644 --- a/vowpalwabbit/core/CMakeLists.txt +++ b/vowpalwabbit/core/CMakeLists.txt @@ -362,7 +362,7 @@ vw_add_library( # Use BUILD_INTERFACE to prevent them from being exported, i.e. treat them as PRIVATE # https://gitlab.kitware.com/cmake/cmake/issues/15415 ${CMAKE_DL_LIBS} ${LINK_THREADS} vw_io $ $ - $ + $ $ DESCRIPTION "This contains all remaining VW code, all reduction implementations, driver, option handling" EXCEPTION_DESCRIPTION "Yes" ENABLE_INSTALL From ee6a4315b31c95276b4fbe747b5323b74c248b10 Mon Sep 17 00:00:00 2001 From: Jack Gerrits Date: Fri, 9 Dec 2022 11:24:53 -0500 Subject: [PATCH 4/7] Update VWFlags.cmake --- cmake/VWFlags.cmake | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cmake/VWFlags.cmake b/cmake/VWFlags.cmake index c0da2ce2c3c..2aeef4c885d 100644 --- a/cmake/VWFlags.cmake +++ b/cmake/VWFlags.cmake @@ -18,14 +18,9 @@ if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64") endif() endif() -set(LINUX_ARM64_OPT_FLAGS "") -if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64|arm64|ARM64") - set(LINUX_ARM64_OPT_FLAGS -mcpu=neoverse-n1) -endif() - # Add -ffast-math for speed, remove for testability. 
# no-stack-check is added to mitigate stack alignment issue on Catalina where there is a bug with aligning stack-check instructions, and stack-check became default option -set(LINUX_RELEASE_CONFIG -fno-strict-aliasing ${LINUX_X86_64_OPT_FLAGS} ${LINUX_ARM64_OPT_FLAGS} -fno-stack-check -fomit-frame-pointer) +set(LINUX_RELEASE_CONFIG -fno-strict-aliasing ${LINUX_X86_64_OPT_FLAGS} -fno-stack-check -fomit-frame-pointer) set(LINUX_DEBUG_CONFIG -fno-stack-check) #Use default visiblity on UNIX otherwise a lot of the C++ symbols end up for exported and interpose'able From 941c896a720d33a6850774b3045c3223530b1ed5 Mon Sep 17 00:00:00 2001 From: zwd-ms <71728747+zwd-ms@users.noreply.github.com> Date: Mon, 12 Dec 2022 17:46:43 -0500 Subject: [PATCH 5/7] Update lda_core.cc fix format --- vowpalwabbit/core/src/reductions/lda_core.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vowpalwabbit/core/src/reductions/lda_core.cc b/vowpalwabbit/core/src/reductions/lda_core.cc index 3e24f96860d..93b05986a45 100644 --- a/vowpalwabbit/core/src/reductions/lda_core.cc +++ b/vowpalwabbit/core/src/reductions/lda_core.cc @@ -191,11 +191,11 @@ inline bool is_aligned16(void* ptr) # endif // Transport SSE intrinsics through sse2neon on ARM: -#if defined(__ARM_NEON) -#define __SSE2__ 1 -#define __SSE3__ 1 -#define __SSE4_1__ 1 -#endif +# if defined(__ARM_NEON) +# define __SSE2__ 1 +# define __SSE3__ 1 +# define __SSE4_1__ 1 +# endif # define HAVE_SIMD_MATHMODE From 01c560c4eecdbb00791ff4b9d0b395e98a975a87 Mon Sep 17 00:00:00 2001 From: zwd-ms <71728747+zwd-ms@users.noreply.github.com> Date: Mon, 12 Dec 2022 17:50:38 -0500 Subject: [PATCH 6/7] Update lda_core.cc --- vowpalwabbit/core/src/reductions/lda_core.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vowpalwabbit/core/src/reductions/lda_core.cc b/vowpalwabbit/core/src/reductions/lda_core.cc index 93b05986a45..33930fe1c78 100644 --- a/vowpalwabbit/core/src/reductions/lda_core.cc +++ 
b/vowpalwabbit/core/src/reductions/lda_core.cc @@ -33,7 +33,7 @@ VW_WARNING_STATE_POP #include "vw/io/logger.h" #if defined(__ARM_NEON) -#include + include #endif #include From ba88bf31ed0395ba89d8efd109f51e9dfc4f9de9 Mon Sep 17 00:00:00 2001 From: zwd-ms <71728747+zwd-ms@users.noreply.github.com> Date: Mon, 12 Dec 2022 17:54:49 -0500 Subject: [PATCH 7/7] Update lda_core.cc --- vowpalwabbit/core/src/reductions/lda_core.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vowpalwabbit/core/src/reductions/lda_core.cc b/vowpalwabbit/core/src/reductions/lda_core.cc index 33930fe1c78..16c63201070 100644 --- a/vowpalwabbit/core/src/reductions/lda_core.cc +++ b/vowpalwabbit/core/src/reductions/lda_core.cc @@ -33,7 +33,7 @@ VW_WARNING_STATE_POP #include "vw/io/logger.h" #if defined(__ARM_NEON) - include +# include #endif #include